{"cells": [{"cell_type": "markdown", "metadata": {"papermill": {"exception": false, "start_time": "2020-10-29T03:00:41.313099", "end_time": "2020-10-29T03:00:41.336098", "duration": 0.022999, "status": "completed"}, "tags": []}, "source": "# Text Preprocessing"}, {"cell_type": "code", "execution_count": 1, "metadata": {"execution": {"iopub.execute_input": "2020-10-29T03:00:41.391098Z", "iopub.status.busy": "2020-10-29T03:00:41.390134Z", "iopub.status.idle": "2020-10-29T03:00:45.233099Z", "shell.execute_reply": "2020-10-29T03:00:45.233099Z"}, "papermill": {"exception": false, "start_time": "2020-10-29T03:00:41.360099", "end_time": "2020-10-29T03:00:45.234098", "duration": 3.873999, "status": "completed"}, "tags": []}, "outputs": [], "source": "import pandas as pd\nimport itertools\nfrom data_describe.text.text_preprocessing import *\nfrom data_describe.misc.load_data import load_data"}, {"cell_type": "markdown", "metadata": {"papermill": {"exception": false, "start_time": "2020-10-29T03:00:45.257098", "end_time": "2020-10-29T03:00:45.280098", "duration": 0.023, "status": "completed"}, "tags": []}, "source": "## Load Data"}, {"cell_type": "code", "execution_count": 2, "metadata": {"execution": {"iopub.execute_input": "2020-10-29T03:00:45.329125Z", "iopub.status.busy": "2020-10-29T03:00:45.328100Z", "iopub.status.idle": "2020-10-29T03:00:45.855143Z", "shell.execute_reply": "2020-10-29T03:00:45.855143Z"}, "papermill": {"exception": false, "start_time": "2020-10-29T03:00:45.302099", "end_time": "2020-10-29T03:00:45.856143", "duration": 0.554044, "status": "completed"}, "tags": []}, "outputs": [], "source": "from sklearn.datasets import fetch_20newsgroups\ncategories = ['alt.atheism']\nnewsgroups = fetch_20newsgroups(subset='train', categories=categories)['data']"}, {"cell_type": "code", "execution_count": 3, "metadata": {"execution": {"iopub.execute_input": "2020-10-29T03:00:45.909143Z", "iopub.status.busy": "2020-10-29T03:00:45.908180Z", "iopub.status.idle": "2020-10-29T03:00:45.913168Z", "shell.execute_reply": "2020-10-29T03:00:45.913168Z"}, "papermill": {"exception": false, "start_time": "2020-10-29T03:00:45.877168", "end_time": "2020-10-29T03:00:45.913168", "duration": 0.036, "status": "completed"}, "tags": []}, "outputs": [{"output_type": "execute_result", "metadata": {}, "data": {"text/plain": "'From: darice@yoyo.cc.monash.edu.au (Fred Rice)\\nSubject: Re: Islam & Dress Code for women\\nOrganizatio'"}, "execution_count": 3}], "source": "newsgroups[0][:100]"}, {"cell_type": "markdown", "metadata": {"papermill": {"exception": false, "start_time": "2020-10-29T03:00:45.936173", "end_time": "2020-10-29T03:00:45.958141", "duration": 0.021968, "status": "completed"}, "tags": []}, "source": "## Tokenize"}, {"cell_type": "code", "execution_count": 4, "metadata": {"execution": {"iopub.execute_input": "2020-10-29T03:00:46.009143Z", "iopub.status.busy": "2020-10-29T03:00:46.008168Z", "iopub.status.idle": "2020-10-29T03:00:46.624174Z", "shell.execute_reply": "2020-10-29T03:00:46.624174Z"}, "papermill": {"exception": false, "start_time": "2020-10-29T03:00:45.981143", "end_time": "2020-10-29T03:00:46.625174", "duration": 0.644031, "status": "completed"}, "tags": []}, "outputs": [], "source": "newsgroups_tokens = tokenize(newsgroups)"}, {"cell_type": "code", "execution_count": 5, "metadata": {"execution": {"iopub.execute_input": "2020-10-29T03:00:46.677168Z", "iopub.status.busy": "2020-10-29T03:00:46.677168Z", "iopub.status.idle": "2020-10-29T03:00:48.956979Z", "shell.execute_reply": "2020-10-29T03:00:48.955972Z"}, "papermill": {"exception": false, "start_time": "2020-10-29T03:00:46.648142", "end_time": "2020-10-29T03:00:48.956979", "duration": 2.308837, "status": "completed"}, "tags": []}, "outputs": [{"output_type": "execute_result", "metadata": {}, "data": {"text/plain": "['From',\n ':',\n 'darice',\n '@',\n 'yoyo.cc.monash.edu.au',\n '(',\n 'Fred',\n 'Rice',\n ')',\n 'Subject']"}, "execution_count": 5}], "source": "to_list(newsgroups_tokens)[0][:10]"}, {"cell_type": "markdown", "metadata": {"papermill": {"exception": false, "start_time": "2020-10-29T03:00:48.980978", "end_time": "2020-10-29T03:00:49.004988", "duration": 0.02401, "status": "completed"}, "tags": []}, "source": "## Change to all lowercase"}, {"cell_type": "code", "execution_count": 6, "metadata": {"execution": {"iopub.execute_input": "2020-10-29T03:00:49.107966Z", "iopub.status.busy": "2020-10-29T03:00:49.091973Z", "iopub.status.idle": "2020-10-29T03:00:51.416606Z", "shell.execute_reply": "2020-10-29T03:00:51.417571Z"}, "papermill": {"exception": false, "start_time": "2020-10-29T03:00:49.027988", "end_time": "2020-10-29T03:00:51.417571", "duration": 2.389583, "status": "completed"}, "tags": []}, "outputs": [{"output_type": "execute_result", "metadata": {}, "data": {"text/plain": "['from',\n ':',\n 'darice',\n '@',\n 'yoyo.cc.monash.edu.au',\n '(',\n 'fred',\n 'rice',\n ')',\n 'subject']"}, "execution_count": 6}], "source": "newsgroups_tokens = tokenize(newsgroups)\nnewsgroups_lower = to_lower(newsgroups_tokens)\nto_list(newsgroups_lower)[0][:10]"}, {"cell_type": "markdown", "metadata": {"papermill": {"exception": false, "start_time": "2020-10-29T03:00:51.445570", "end_time": "2020-10-29T03:00:51.470567", "duration": 0.024997, "status": "completed"}, "tags": []}, "source": "## Run a preprocessing pipeline in one line"}, {"cell_type": "code", "execution_count": 7, "metadata": {"execution": {"iopub.execute_input": "2020-10-29T03:00:51.548604Z", "iopub.status.busy": "2020-10-29T03:00:51.542569Z", "iopub.status.idle": "2020-10-29T03:00:53.863233Z", "shell.execute_reply": "2020-10-29T03:00:53.862234Z"}, "papermill": {"exception": false, "start_time": "2020-10-29T03:00:51.493569", "end_time": "2020-10-29T03:00:53.864232", "duration": 2.370663, "status": "completed"}, "tags": []}, "outputs": [{"output_type": "execute_result", "metadata": {}, "data": {"text/plain": "['from',\n ':',\n 'darice',\n '@',\n 'yoyo.cc.monash.edu.au',\n '(',\n 'fred',\n 'rice',\n ')',\n 'subject']"}, "execution_count": 7}], "source": "to_list(preprocess_texts(newsgroups, custom_pipeline=['tokenize', 'to_lower']))[0][:10]"}, {"cell_type": "markdown", "metadata": {"papermill": {"exception": false, "start_time": "2020-10-29T03:00:53.888196", "end_time": "2020-10-29T03:00:53.912227", "duration": 0.024031, "status": "completed"}, "tags": []}, "source": "## Remove punctuation"}, {"cell_type": "code", "execution_count": 8, "metadata": {"execution": {"iopub.execute_input": "2020-10-29T03:00:53.989232Z", "iopub.status.busy": "2020-10-29T03:00:53.972202Z", "iopub.status.idle": "2020-10-29T03:00:57.505239Z", "shell.execute_reply": "2020-10-29T03:00:57.506240Z"}, "papermill": {"exception": false, "start_time": "2020-10-29T03:00:53.935226", "end_time": "2020-10-29T03:00:57.506240", "duration": 3.571014, "status": "completed"}, "tags": []}, "outputs": [{"output_type": "execute_result", "metadata": {}, "data": {"text/plain": "['From',\n 'darice',\n 'yoyo.cc.monash.edu.au',\n 'Fred',\n 'Rice',\n 'Subject',\n 'Re',\n 'Islam',\n 'Dress',\n 'Code']"}, "execution_count": 8}], "source": "to_list(preprocess_texts(newsgroups, custom_pipeline=['tokenize', 'remove_punct']))[0][:10]"}, {"cell_type": "markdown", "metadata": {"papermill": {"exception": false, "start_time": "2020-10-29T03:00:57.531266", "end_time": "2020-10-29T03:00:57.556239", "duration": 0.024973, "status": "completed"}, "tags": []}, "source": "## Remove digits"}, {"cell_type": "code", "execution_count": 9, "metadata": {"execution": {"iopub.execute_input": "2020-10-29T03:00:57.611268Z", "iopub.status.busy": "2020-10-29T03:00:57.610273Z", "iopub.status.idle": "2020-10-29T03:00:57.614242Z", "shell.execute_reply": "2020-10-29T03:00:57.615271Z"}, "papermill": {"exception": false, "start_time": "2020-10-29T03:00:57.580240", "end_time": "2020-10-29T03:00:57.615271", "duration": 0.035031, "status": "completed"}, "tags": []}, "outputs": [{"output_type": "execute_result", "metadata": {}, "data": {"text/plain": "['this', 'is', '', 'a', 'test', '', 'if', 'it', 'works']"}, "execution_count": 9}], "source": "digits_test_list = [['this', 'is', '3', 'a', 'test', '2c', 'if', 'it', 'works']]\nto_list(preprocess_texts(digits_test_list, custom_pipeline=['remove_digits']))[0]"}, {"cell_type": "markdown", "metadata": {"papermill": {"exception": false, "start_time": "2020-10-29T03:00:57.640276", "end_time": "2020-10-29T03:00:57.667245", "duration": 0.026969, "status": "completed"}, "tags": []}, "source": "## Remove single characters and spaces"}, {"cell_type": "code", "execution_count": 10, "metadata": {"execution": {"iopub.execute_input": "2020-10-29T03:00:57.728280Z", "iopub.status.busy": "2020-10-29T03:00:57.728280Z", "iopub.status.idle": "2020-10-29T03:00:57.732275Z", "shell.execute_reply": "2020-10-29T03:00:57.733268Z"}, "papermill": {"exception": false, "start_time": "2020-10-29T03:00:57.697293", "end_time": "2020-10-29T03:00:57.733268", "duration": 0.035975, "status": "completed"}, "tags": []}, "outputs": [{"output_type": "execute_result", "metadata": {}, "data": {"text/plain": "['this', 'is', 'test']"}, "execution_count": 10}], "source": "single_char_spaces_test_list = [['this', 'is', ' ', 'a', 'test', ' ', 'b']]\nto_list(preprocess_texts(single_char_spaces_test_list, custom_pipeline=['remove_single_char_and_spaces']))[0]"}, {"cell_type": "markdown", "metadata": {"papermill": {"exception": false, "start_time": "2020-10-29T03:00:57.758273", "end_time": "2020-10-29T03:00:57.785274", "duration": 0.027001, "status": "completed"}, "tags": []}, "source": "## Remove stopwords"}, {"cell_type": "code", "execution_count": 11, "metadata": {"execution": {"iopub.execute_input": "2020-10-29T03:00:57.842240Z", "iopub.status.busy": "2020-10-29T03:00:57.841240Z", "iopub.status.idle": "2020-10-29T03:01:01.482073Z", "shell.execute_reply": "2020-10-29T03:01:01.481073Z"}, "papermill": {"exception": false, "start_time": "2020-10-29T03:00:57.810275", "end_time": "2020-10-29T03:01:01.483073", "duration": 3.672798, "status": "completed"}, "tags": []}, "outputs": [{"output_type": "execute_result", "metadata": {}, "data": {"text/plain": "['From',\n 'darice',\n 'yoyo.cc.monash.edu.au',\n 'Fred',\n 'Rice',\n 'Subject',\n 'Re',\n 'Islam',\n 'Dress',\n 'Code']"}, "execution_count": 11}], "source": "to_list(preprocess_texts(newsgroups, custom_pipeline=['tokenize', 'remove_punct', 'remove_stopwords']))[0][:10]"}, {"cell_type": "markdown", "metadata": {"papermill": {"exception": false, "start_time": "2020-10-29T03:01:01.509072", "end_time": "2020-10-29T03:01:01.536075", "duration": 0.027003, "status": "completed"}, "tags": []}, "source": "## Stem words"}, {"cell_type": "code", "execution_count": 12, "metadata": {"execution": {"iopub.execute_input": "2020-10-29T03:01:01.600075Z", "iopub.status.busy": "2020-10-29T03:01:01.598073Z", "iopub.status.idle": "2020-10-29T03:01:08.163872Z", "shell.execute_reply": "2020-10-29T03:01:08.163872Z"}, "papermill": {"exception": false, "start_time": "2020-10-29T03:01:01.565073", "end_time": "2020-10-29T03:01:08.164895", "duration": 6.599822, "status": "completed"}, "tags": []}, "outputs": [{"output_type": "execute_result", "metadata": {}, "data": {"text/plain": "['from',\n 'dar',\n 'yoyo.cc.monash.edu.au',\n 'fred',\n 'ric',\n 'subject',\n 're',\n 'islam',\n 'dress',\n 'cod']"}, "execution_count": 12}], "source": "to_list(preprocess_texts(newsgroups, custom_pipeline=['tokenize', 'remove_punct', 'remove_stopwords', 'stem']))[0][:10]"}, {"cell_type": "markdown", "metadata": {"papermill": {"exception": false, "start_time": "2020-10-29T03:01:08.195870", "end_time": "2020-10-29T03:01:08.221866", "duration": 0.025996, "status": "completed"}, "tags": []}, "source": "## Lemmatize words"}, {"cell_type": "code", "execution_count": 13, "metadata": {"execution": {"iopub.execute_input": "2020-10-29T03:01:08.278865Z", "iopub.status.busy": "2020-10-29T03:01:08.277866Z", "iopub.status.idle": "2020-10-29T03:01:14.839973Z", "shell.execute_reply": "2020-10-29T03:01:14.840998Z"}, "papermill": {"exception": false, "start_time": "2020-10-29T03:01:08.246894", "end_time": "2020-10-29T03:01:14.840998", "duration": 6.594104, "status": "completed"}, "tags": []}, "outputs": [{"output_type": "execute_result", "metadata": {}, "data": {"text/plain": "['From',\n 'darice',\n 'yoyo.cc.monash.edu.au',\n 'Fred',\n 'Rice',\n 'Subject',\n 'Re',\n 'Islam',\n 'Dress',\n 'Code']"}, "execution_count": 13}], "source": "to_list(preprocess_texts(newsgroups, custom_pipeline=['tokenize', 'remove_punct', 'remove_stopwords', 'lemmatize']))[0][:10]"}, {"cell_type": "markdown", "metadata": {"papermill": {"exception": false, "start_time": "2020-10-29T03:01:14.867980", "end_time": "2020-10-29T03:01:14.894969", "duration": 0.026989, "status": "completed"}, "tags": []}, "source": "## Custom Function"}, {"cell_type": "code", "execution_count": 14, "metadata": {"execution": {"iopub.execute_input": "2020-10-29T03:01:14.977004Z", "iopub.status.busy": "2020-10-29T03:01:14.955004Z", "iopub.status.idle": "2020-10-29T03:01:17.439014Z", "shell.execute_reply": "2020-10-29T03:01:17.440015Z"}, "papermill": {"exception": false, "start_time": "2020-10-29T03:01:14.919970", "end_time": "2020-10-29T03:01:17.440015", "duration": 2.520045, "status": "completed"}, "tags": []}, "outputs": [{"output_type": "execute_result", "metadata": {}, "data": {"text/plain": "['FROM',\n ':',\n 'DARICE',\n '@',\n 'YOYO.CC.MONASH.EDU.AU',\n '(',\n 'FRED',\n 'RICE',\n ')',\n 'SUBJECT']"}, "execution_count": 14}], "source": "def shout(text_docs_bow):\n return ((word.upper() for word in doc) for doc in text_docs_bow)\n\nto_list(preprocess_texts(newsgroups, custom_pipeline=['tokenize', shout]))[0][:10]"}, {"cell_type": "markdown", "metadata": {"papermill": {"exception": false, "start_time": "2020-10-29T03:01:17.476015", "end_time": "2020-10-29T03:01:17.508015", "duration": 0.032, "status": "completed"}, "tags": []}, "source": "## Convert back to a single string"}, {"cell_type": "code", "execution_count": 15, "metadata": {"execution": {"iopub.execute_input": "2020-10-29T03:01:17.577041Z", "iopub.status.busy": "2020-10-29T03:01:17.575049Z", "iopub.status.idle": "2020-10-29T03:01:22.116117Z", "shell.execute_reply": "2020-10-29T03:01:22.115123Z"}, "papermill": {"exception": false, "start_time": "2020-10-29T03:01:17.540015", "end_time": "2020-10-29T03:01:22.116117", "duration": 4.576102, "status": "completed"}, "tags": []}, "outputs": [{"output_type": "execute_result", "metadata": {}, "data": {"text/plain": "\"From darice yoyo.cc.monash.edu.au Fred Rice Subject Re Islam Dress Code woman Organization Monash University Melb Australia Lines In . .rz.tu-bs.de .rz.tu-bs.de Benedikt Rosenau writes In article .. .cc.monash.edu.au darice yoyo.cc.monash.edu.au Fred Rice writes Deletion Of course people say think religion exactly coming different people within religion There nothing existing different perspective within religion perhaps one say tend converge truth My point lot harm way meantime And converge counterfactual religion appear split diverge Even might True Religion core layer determine happens practise quite inhumane usually What post supposed answer I n't see got I say I repeat Religions harm people And religion converge split Giving disagree upon And lot disagreement one tolerant one tolerant Ideologies also split giving disagree upon may also lead intolerance So also oppose ideology I n't think argument argument religion point weakness human nature Big deletion Do woman soul Islam Peo\""}, "execution_count": 15}], "source": "to_list(preprocess_texts(newsgroups, custom_pipeline=[\n 'tokenize', \n 'remove_punct', \n 'remove_stopwords', \n 'lemmatize',\n 'remove_digits',\n 'bag_of_words_to_docs'\n]))[0][:1000]"}, {"cell_type": "markdown", "metadata": {"papermill": {"exception": false, "start_time": "2020-10-29T03:01:22.144092", "end_time": "2020-10-29T03:01:22.173127", "duration": 0.029035, "status": "completed"}, "tags": []}, "source": "## Create a document-word frequency matrix"}, {"cell_type": "code", "execution_count": 16, "metadata": {"execution": {"iopub.execute_input": "2020-10-29T03:01:22.231090Z", "iopub.status.busy": "2020-10-29T03:01:22.230090Z", "iopub.status.idle": "2020-10-29T03:01:26.927102Z", "shell.execute_reply": "2020-10-29T03:01:26.927102Z"}, "papermill": {"exception": false, "start_time": "2020-10-29T03:01:22.199126", "end_time": "2020-10-29T03:01:26.927102", "duration": 4.727976, "status": "completed"}, "tags": []}, "outputs": [{"output_type": "execute_result", "metadata": {}, "data": {"text/plain": " _o aa aaa aah aap aario aaron abandoned abberation abc ... \\\n0 0 0 0 0 0 0 0 0 0 0 ... \n1 0 0 0 0 0 0 0 0 0 0 ... \n2 0 0 0 0 0 0 0 0 0 0 ... \n3 0 0 0 0 0 0 0 0 0 0 ... \n4 0 0 0 0 0 0 0 0 0 0 ... \n\n zlumber zombie zoo zues zumder zur zurlo zus zvonko zyklon \n0 0 0 0 0 0 0 0 0 0 0 \n1 0 0 0 0 0 0 0 0 0 0 \n2 0 0 0 0 0 0 0 0 0 0 \n3 0 0 0 0 0 0 0 0 0 0 \n4 0 0 0 0 0 0 0 0 0 0 \n\n[5 rows x 10039 columns]", "text/html": "
\n | _o | \naa | \naaa | \naah | \naap | \naario | \naaron | \nabandoned | \nabberation | \nabc | \n... | \nzlumber | \nzombie | \nzoo | \nzues | \nzumder | \nzur | \nzurlo | \nzus | \nzvonko | \nzyklon | \n
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n... | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n
1 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n... | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n
2 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n... | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n
3 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n... | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n
4 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n... | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n0 | \n
5 rows \u00d7 10039 columns
\n\n | _o | \naa | \naaa | \naah | \naap | \naario | \naaron | \nabandoned | \nabberation | \nabc | \n... | \nzlumber | \nzombie | \nzoo | \nzues | \nzumder | \nzur | \nzurlo | \nzus | \nzvonko | \nzyklon | \n
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n... | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n
1 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n... | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n
2 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n... | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n
3 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n... | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n
4 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n... | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n0.0 | \n
5 rows \u00d7 10039 columns
\n