{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Text Preprocessing" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import itertools\n", "# Import the preprocessing API explicitly (avoids `import *` namespace pollution\n", "# and makes it clear where tokenize/to_list/preprocess_texts etc. come from).\n", "from data_describe.text.text_preprocessing import (\n", "    tokenize,\n", "    to_list,\n", "    to_lower,\n", "    preprocess_texts,\n", "    create_doc_term_matrix,\n", "    create_tfidf_matrix,\n", ")\n", "from data_describe.misc.load_data import load_data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load Data" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "from sklearn.datasets import fetch_20newsgroups\n", "categories = ['alt.atheism']\n", "newsgroups = fetch_20newsgroups(subset='train', categories=categories)['data']" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'From: darice@yoyo.cc.monash.edu.au (Fred Rice)\\nSubject: Re: Islam & Dress Code for women\\nOrganizatio'" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "newsgroups[0][:100]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Tokenize" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "newsgroups_tokens = tokenize(newsgroups)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['From',\n", " ':',\n", " 'darice',\n", " '@',\n", " 'yoyo.cc.monash.edu.au',\n", " '(',\n", " 'Fred',\n", " 'Rice',\n", " ')',\n", " 'Subject']" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "to_list(newsgroups_tokens)[0][:10]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Change to all lowercase" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['from',\n", " ':',\n", " 'darice',\n", " '@',\n", " 'yoyo.cc.monash.edu.au',\n", " '(',\n", " 'fred',\n", " 'rice',\n", " ')',\n", " 'subject']" ] }, "execution_count": 22, "metadata": {}, 
"output_type": "execute_result" } ], "source": [ "newsgroups_tokens = tokenize(newsgroups)\n", "newsgroups_lower = to_lower(newsgroups_tokens)\n", "to_list(newsgroups_lower)[0][:10]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Run a preprocessing pipeline in one line" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['from',\n", " ':',\n", " 'darice',\n", " '@',\n", " 'yoyo.cc.monash.edu.au',\n", " '(',\n", " 'fred',\n", " 'rice',\n", " ')',\n", " 'subject']" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "to_list(preprocess_texts(newsgroups, custom_pipeline=['tokenize', 'to_lower']))[0][:10]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Remove punctuation" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['From',\n", " 'darice',\n", " 'yoyo.cc.monash.edu.au',\n", " 'Fred',\n", " 'Rice',\n", " 'Subject',\n", " 'Re',\n", " 'Islam',\n", " 'Dress',\n", " 'Code']" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "to_list(preprocess_texts(newsgroups, custom_pipeline=['tokenize', 'remove_punct']))[0][:10]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Remove digits" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['this', 'is', '', 'a', 'test', '', 'if', 'it', 'works']" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "digits_test_list = [['this', 'is', '3', 'a', 'test', '2c', 'if', 'it', 'works']]\n", "to_list(preprocess_texts(digits_test_list, custom_pipeline=['remove_digits']))[0]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Remove single characters and spaces" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['this', 'is', 
'test']" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "single_char_spaces_test_list = [['this', 'is', ' ', 'a', 'test', ' ', 'b']]\n", "to_list(preprocess_texts(single_char_spaces_test_list, custom_pipeline=['remove_single_char_and_spaces']))[0]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Remove stopwords" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['From',\n", " 'darice',\n", " 'yoyo.cc.monash.edu.au',\n", " 'Fred',\n", " 'Rice',\n", " 'Subject',\n", " 'Re',\n", " 'Islam',\n", " 'Dress',\n", " 'Code']" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "to_list(preprocess_texts(newsgroups, custom_pipeline=['tokenize', 'remove_punct', 'remove_stopwords']))[0][:10]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Stem words" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['from',\n", " 'dar',\n", " 'yoyo.cc.monash.edu.au',\n", " 'fred',\n", " 'ric',\n", " 'subject',\n", " 're',\n", " 'islam',\n", " 'dress',\n", " 'cod']" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "to_list(preprocess_texts(newsgroups, custom_pipeline=['tokenize', 'remove_punct', 'remove_stopwords', 'stem']))[0][:10]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Lemmatize words" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['From',\n", " 'darice',\n", " 'yoyo.cc.monash.edu.au',\n", " 'Fred',\n", " 'Rice',\n", " 'Subject',\n", " 'Re',\n", " 'Islam',\n", " 'Dress',\n", " 'Code']" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "to_list(preprocess_texts(newsgroups, custom_pipeline=['tokenize', 'remove_punct', 'remove_stopwords', 'lemmatize']))[0][:10]" ] }, { "cell_type": 
"markdown", "metadata": {}, "source": [ "## Custom Function" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['FROM',\n", " ':',\n", " 'DARICE',\n", " '@',\n", " 'YOYO.CC.MONASH.EDU.AU',\n", " '(',\n", " 'FRED',\n", " 'RICE',\n", " ')',\n", " 'SUBJECT']" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def shout(text_docs_bow):\n", " return ((word.upper() for word in doc) for doc in text_docs_bow)\n", "\n", "to_list(preprocess_texts(newsgroups, custom_pipeline=['tokenize', shout]))[0][:10]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Convert back to a single string" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\"From darice yoyo.cc.monash.edu.au Fred Rice Subject Re Islam Dress Code woman Organization Monash University Melb Australia Lines In . .rz.tu-bs.de .rz.tu-bs.de Benedikt Rosenau writes In article .. 
.cc.monash.edu.au darice yoyo.cc.monash.edu.au Fred Rice writes Deletion Of course people say think religion exactly coming different people within religion There nothing existing different perspective within religion perhaps one say tend converge truth My point lot harm way meantime And converge counterfactual religion appear split diverge Even might True Religion core layer determine happens practise quite inhumane usually What post supposed answer I n't see got I say I repeat Religions harm people And religion converge split Giving disagree upon And lot disagreement one tolerant one tolerant Ideologies also split giving disagree upon may also lead intolerance So also oppose ideology I n't think argument argument religion point weakness human nature Big deletion Do woman soul Islam Peo\"" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "to_list(preprocess_texts(newsgroups, custom_pipeline=[\n", " 'tokenize', \n", " 'remove_punct', \n", " 'remove_stopwords', \n", " 'lemmatize',\n", " 'remove_digits',\n", " 'bag_of_words_to_docs'\n", "]))[0][:1000]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Create a document-word frequency matrix" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
_oaaaaaaahaapaarioaaronabandonedabberationabc...zlumberzombiezoozueszumderzurzurlozuszvonkozyklon
00000000000...0000000000
10000000000...0000000000
20000000000...0000000000
30000000000...0000000000
40000000000...0000000000
\n", "

5 rows × 10039 columns

\n", "
" ], "text/plain": [ " _o aa aaa aah aap aario aaron abandoned abberation abc ... \\\n", "0 0 0 0 0 0 0 0 0 0 0 ... \n", "1 0 0 0 0 0 0 0 0 0 0 ... \n", "2 0 0 0 0 0 0 0 0 0 0 ... \n", "3 0 0 0 0 0 0 0 0 0 0 ... \n", "4 0 0 0 0 0 0 0 0 0 0 ... \n", "\n", " zlumber zombie zoo zues zumder zur zurlo zus zvonko zyklon \n", "0 0 0 0 0 0 0 0 0 0 0 \n", "1 0 0 0 0 0 0 0 0 0 0 \n", "2 0 0 0 0 0 0 0 0 0 0 \n", "3 0 0 0 0 0 0 0 0 0 0 \n", "4 0 0 0 0 0 0 0 0 0 0 \n", "\n", "[5 rows x 10039 columns]" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "newsgroups_docs = preprocess_texts(newsgroups, custom_pipeline=[\n", " 'tokenize', \n", " 'remove_punct', \n", " 'remove_stopwords', \n", " 'lemmatize',\n", " 'remove_digits',\n", " 'bag_of_words_to_docs'\n", "])\n", "create_doc_term_matrix(newsgroups_docs).iloc[:5, 10:]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Create a TF-IDF matrix" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
_oaaaaaaahaapaarioaaronabandonedabberationabc...zlumberzombiezoozueszumderzurzurlozuszvonkozyklon
00.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
10.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
20.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
30.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
40.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
\n", "

5 rows × 10039 columns

\n", "
" ], "text/plain": [ " _o aa aaa aah aap aario aaron abandoned abberation abc ... \\\n", "0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... \n", "1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... \n", "2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... \n", "3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... \n", "4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... \n", "\n", " zlumber zombie zoo zues zumder zur zurlo zus zvonko zyklon \n", "0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "\n", "[5 rows x 10039 columns]" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "newsgroups_docs = preprocess_texts(newsgroups, custom_pipeline=[\n", " 'tokenize', \n", " 'remove_punct', \n", " 'remove_stopwords', \n", " 'lemmatize',\n", " 'remove_digits',\n", " 'bag_of_words_to_docs'\n", "])\n", "create_tfidf_matrix(newsgroups_docs).iloc[:5, 10:]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Ngrams Frequency" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "FreqDist({('Lines', ''): 492, ('Subject', 'Re'): 455, ('In', 'article'): 372, ('I', \"n't\"): 227, ('', 'In'): 204, ('I', 'think'): 169, ('article', ''): 151, ('article', '..'): 128, ('', ''): 126, ('writes', 'In'): 101, ...})" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "newsgroups_ngrams = preprocess_texts(newsgroups, custom_pipeline=[\n", " 'tokenize', \n", " 'remove_punct',\n", " 'remove_digits',\n", " 'remove_stopwords',\n", " 'ngram_freq'\n", "])\n", "newsgroups_ngrams" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": 
"text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.7" } }, "nbformat": 4, "nbformat_minor": 4 }