{"cells": [{"cell_type": "markdown", "metadata": {"papermill": {"exception": false, "start_time": "2020-10-29T03:00:41.313099", "end_time": "2020-10-29T03:00:41.336098", "duration": 0.022999, "status": "completed"}, "tags": []}, "source": "# Text Preprocessing"}, {"cell_type": "code", "execution_count": 1, "metadata": {"execution": {"iopub.execute_input": "2020-10-29T03:00:41.391098Z", "iopub.status.busy": "2020-10-29T03:00:41.390134Z", "iopub.status.idle": "2020-10-29T03:00:45.233099Z", "shell.execute_reply": "2020-10-29T03:00:45.233099Z"}, "papermill": {"exception": false, "start_time": "2020-10-29T03:00:41.360099", "end_time": "2020-10-29T03:00:45.234098", "duration": 3.873999, "status": "completed"}, "tags": []}, "outputs": [], "source": "import pandas as pd\nimport itertools\nfrom data_describe.text.text_preprocessing import *\nfrom data_describe.misc.load_data import load_data"}, {"cell_type": "markdown", "metadata": {"papermill": {"exception": false, "start_time": "2020-10-29T03:00:45.257098", "end_time": "2020-10-29T03:00:45.280098", "duration": 0.023, "status": "completed"}, "tags": []}, "source": "## Load Data"}, {"cell_type": "code", "execution_count": 2, "metadata": {"execution": {"iopub.execute_input": "2020-10-29T03:00:45.329125Z", "iopub.status.busy": "2020-10-29T03:00:45.328100Z", "iopub.status.idle": "2020-10-29T03:00:45.855143Z", "shell.execute_reply": "2020-10-29T03:00:45.855143Z"}, "papermill": {"exception": false, "start_time": "2020-10-29T03:00:45.302099", "end_time": "2020-10-29T03:00:45.856143", "duration": 0.554044, "status": "completed"}, "tags": []}, "outputs": [], "source": "from sklearn.datasets import fetch_20newsgroups\ncategories = ['alt.atheism']\nnewsgroups = fetch_20newsgroups(subset='train', categories=categories)['data']"}, {"cell_type": "code", "execution_count": 3, "metadata": {"execution": {"iopub.execute_input": "2020-10-29T03:00:45.909143Z", "iopub.status.busy": "2020-10-29T03:00:45.908180Z", "iopub.status.idle": "2020-10-29T03:00:45.913168Z", "shell.execute_reply": "2020-10-29T03:00:45.913168Z"}, "papermill": {"exception": false, "start_time": "2020-10-29T03:00:45.877168", "end_time": "2020-10-29T03:00:45.913168", "duration": 0.036, "status": "completed"}, "tags": []}, "outputs": [{"output_type": "execute_result", "metadata": {}, "data": {"text/plain": "'From: darice@yoyo.cc.monash.edu.au (Fred Rice)\\nSubject: Re: Islam & Dress Code for women\\nOrganizatio'"}, "execution_count": 3}], "source": "newsgroups[0][:100]"}, {"cell_type": "markdown", "metadata": {"papermill": {"exception": false, "start_time": "2020-10-29T03:00:45.936173", "end_time": "2020-10-29T03:00:45.958141", "duration": 0.021968, "status": "completed"}, "tags": []}, "source": "## Tokenize"}, {"cell_type": "code", "execution_count": 4, "metadata": {"execution": {"iopub.execute_input": "2020-10-29T03:00:46.009143Z", "iopub.status.busy": "2020-10-29T03:00:46.008168Z", "iopub.status.idle": "2020-10-29T03:00:46.624174Z", "shell.execute_reply": "2020-10-29T03:00:46.624174Z"}, "papermill": {"exception": false, "start_time": "2020-10-29T03:00:45.981143", "end_time": "2020-10-29T03:00:46.625174", "duration": 0.644031, "status": "completed"}, "tags": []}, "outputs": [], "source": "newsgroups_tokens = tokenize(newsgroups)"}, {"cell_type": "code", "execution_count": 5, "metadata": {"execution": {"iopub.execute_input": "2020-10-29T03:00:46.677168Z", "iopub.status.busy": "2020-10-29T03:00:46.677168Z", "iopub.status.idle": "2020-10-29T03:00:48.956979Z", "shell.execute_reply": "2020-10-29T03:00:48.955972Z"}, "papermill": {"exception": false, "start_time": "2020-10-29T03:00:46.648142", "end_time": "2020-10-29T03:00:48.956979", "duration": 2.308837, "status": "completed"}, "tags": []}, "outputs": [{"output_type": "execute_result", "metadata": {}, "data": {"text/plain": "['From',\n ':',\n 'darice',\n '@',\n 'yoyo.cc.monash.edu.au',\n '(',\n 'Fred',\n 'Rice',\n ')',\n 'Subject']"}, "execution_count": 5}], "source": "to_list(newsgroups_tokens)[0][:10]"}, {"cell_type": "markdown", "metadata": {"papermill": {"exception": false, "start_time": "2020-10-29T03:00:48.980978", "end_time": "2020-10-29T03:00:49.004988", "duration": 0.02401, "status": "completed"}, "tags": []}, "source": "## Change to all lowercase"}, {"cell_type": "code", "execution_count": 6, "metadata": {"execution": {"iopub.execute_input": "2020-10-29T03:00:49.107966Z", "iopub.status.busy": "2020-10-29T03:00:49.091973Z", "iopub.status.idle": "2020-10-29T03:00:51.416606Z", "shell.execute_reply": "2020-10-29T03:00:51.417571Z"}, "papermill": {"exception": false, "start_time": "2020-10-29T03:00:49.027988", "end_time": "2020-10-29T03:00:51.417571", "duration": 2.389583, "status": "completed"}, "tags": []}, "outputs": [{"output_type": "execute_result", "metadata": {}, "data": {"text/plain": "['from',\n ':',\n 'darice',\n '@',\n 'yoyo.cc.monash.edu.au',\n '(',\n 'fred',\n 'rice',\n ')',\n 'subject']"}, "execution_count": 6}], "source": "newsgroups_tokens = tokenize(newsgroups)\nnewsgroups_lower = to_lower(newsgroups_tokens)\nto_list(newsgroups_lower)[0][:10]"}, {"cell_type": "markdown", "metadata": {"papermill": {"exception": false, "start_time": "2020-10-29T03:00:51.445570", "end_time": "2020-10-29T03:00:51.470567", "duration": 0.024997, "status": "completed"}, "tags": []}, "source": "## Run a preprocessing pipeline in one line"}, {"cell_type": "code", "execution_count": 7, "metadata": {"execution": {"iopub.execute_input": "2020-10-29T03:00:51.548604Z", "iopub.status.busy": "2020-10-29T03:00:51.542569Z", "iopub.status.idle": "2020-10-29T03:00:53.863233Z", "shell.execute_reply": "2020-10-29T03:00:53.862234Z"}, "papermill": {"exception": false, "start_time": "2020-10-29T03:00:51.493569", "end_time": "2020-10-29T03:00:53.864232", "duration": 2.370663, "status": "completed"}, "tags": []}, "outputs": [{"output_type": "execute_result", "metadata": {}, "data": {"text/plain": "['from',\n ':',\n 'darice',\n '@',\n 'yoyo.cc.monash.edu.au',\n '(',\n 'fred',\n 'rice',\n ')',\n 'subject']"}, "execution_count": 7}], "source": "to_list(preprocess_texts(newsgroups, custom_pipeline=['tokenize', 'to_lower']))[0][:10]"}, {"cell_type": "markdown", "metadata": {"papermill": {"exception": false, "start_time": "2020-10-29T03:00:53.888196", "end_time": "2020-10-29T03:00:53.912227", "duration": 0.024031, "status": "completed"}, "tags": []}, "source": "## Remove punctuation"}, {"cell_type": "code", "execution_count": 8, "metadata": {"execution": {"iopub.execute_input": "2020-10-29T03:00:53.989232Z", "iopub.status.busy": "2020-10-29T03:00:53.972202Z", "iopub.status.idle": "2020-10-29T03:00:57.505239Z", "shell.execute_reply": "2020-10-29T03:00:57.506240Z"}, "papermill": {"exception": false, "start_time": "2020-10-29T03:00:53.935226", "end_time": "2020-10-29T03:00:57.506240", "duration": 3.571014, "status": "completed"}, "tags": []}, "outputs": [{"output_type": "execute_result", "metadata": {}, "data": {"text/plain": "['From',\n 'darice',\n 'yoyo.cc.monash.edu.au',\n 'Fred',\n 'Rice',\n 'Subject',\n 'Re',\n 'Islam',\n 'Dress',\n 'Code']"}, "execution_count": 8}], "source": "to_list(preprocess_texts(newsgroups, custom_pipeline=['tokenize', 'remove_punct']))[0][:10]"}, {"cell_type": "markdown", "metadata": {"papermill": {"exception": false, "start_time": "2020-10-29T03:00:57.531266", "end_time": "2020-10-29T03:00:57.556239", "duration": 0.024973, "status": "completed"}, "tags": []}, "source": "## Remove digits"}, {"cell_type": "code", "execution_count": 9, "metadata": {"execution": {"iopub.execute_input": "2020-10-29T03:00:57.611268Z", "iopub.status.busy": "2020-10-29T03:00:57.610273Z", "iopub.status.idle": "2020-10-29T03:00:57.614242Z", "shell.execute_reply": "2020-10-29T03:00:57.615271Z"}, "papermill": {"exception": false, "start_time": "2020-10-29T03:00:57.580240", "end_time": "2020-10-29T03:00:57.615271", "duration": 0.035031, "status": "completed"}, "tags": []}, "outputs": [{"output_type": "execute_result", "metadata": {}, "data": {"text/plain": "['this', 'is', '', 'a', 'test', '', 'if', 'it', 'works']"}, "execution_count": 9}], "source": "digits_test_list = [['this', 'is', '3', 'a', 'test', '2c', 'if', 'it', 'works']]\nto_list(preprocess_texts(digits_test_list, custom_pipeline=['remove_digits']))[0]"}, {"cell_type": "markdown", "metadata": {"papermill": {"exception": false, "start_time": "2020-10-29T03:00:57.640276", "end_time": "2020-10-29T03:00:57.667245", "duration": 0.026969, "status": "completed"}, "tags": []}, "source": "## Remove single characters and spaces"}, {"cell_type": "code", "execution_count": 10, "metadata": {"execution": {"iopub.execute_input": "2020-10-29T03:00:57.728280Z", "iopub.status.busy": "2020-10-29T03:00:57.728280Z", "iopub.status.idle": "2020-10-29T03:00:57.732275Z", "shell.execute_reply": "2020-10-29T03:00:57.733268Z"}, "papermill": {"exception": false, "start_time": "2020-10-29T03:00:57.697293", "end_time": "2020-10-29T03:00:57.733268", "duration": 0.035975, "status": "completed"}, "tags": []}, "outputs": [{"output_type": "execute_result", "metadata": {}, "data": {"text/plain": "['this', 'is', 'test']"}, "execution_count": 10}], "source": "single_char_spaces_test_list = [['this', 'is', ' ', 'a', 'test', ' ', 'b']]\nto_list(preprocess_texts(single_char_spaces_test_list, custom_pipeline=['remove_single_char_and_spaces']))[0]"}, {"cell_type": "markdown", "metadata": {"papermill": {"exception": false, "start_time": "2020-10-29T03:00:57.758273", "end_time": "2020-10-29T03:00:57.785274", "duration": 0.027001, "status": "completed"}, "tags": []}, "source": "## Remove stopwords"}, {"cell_type": "code", "execution_count": 11, "metadata": {"execution": {"iopub.execute_input": "2020-10-29T03:00:57.842240Z", "iopub.status.busy": "2020-10-29T03:00:57.841240Z", "iopub.status.idle": "2020-10-29T03:01:01.482073Z", "shell.execute_reply": "2020-10-29T03:01:01.481073Z"}, "papermill": {"exception": false, "start_time": "2020-10-29T03:00:57.810275", "end_time": "2020-10-29T03:01:01.483073", "duration": 3.672798, "status": "completed"}, "tags": []}, "outputs": [{"output_type": "execute_result", "metadata": {}, "data": {"text/plain": "['From',\n 'darice',\n 'yoyo.cc.monash.edu.au',\n 'Fred',\n 'Rice',\n 'Subject',\n 'Re',\n 'Islam',\n 'Dress',\n 'Code']"}, "execution_count": 11}], "source": "to_list(preprocess_texts(newsgroups, custom_pipeline=['tokenize', 'remove_punct', 'remove_stopwords']))[0][:10]"}, {"cell_type": "markdown", "metadata": {"papermill": {"exception": false, "start_time": "2020-10-29T03:01:01.509072", "end_time": "2020-10-29T03:01:01.536075", "duration": 0.027003, "status": "completed"}, "tags": []}, "source": "## Stem words"}, {"cell_type": "code", "execution_count": 12, "metadata": {"execution": {"iopub.execute_input": "2020-10-29T03:01:01.600075Z", "iopub.status.busy": "2020-10-29T03:01:01.598073Z", "iopub.status.idle": "2020-10-29T03:01:08.163872Z", "shell.execute_reply": "2020-10-29T03:01:08.163872Z"}, "papermill": {"exception": false, "start_time": "2020-10-29T03:01:01.565073", "end_time": "2020-10-29T03:01:08.164895", "duration": 6.599822, "status": "completed"}, "tags": []}, "outputs": [{"output_type": "execute_result", "metadata": {}, "data": {"text/plain": "['from',\n 'dar',\n 'yoyo.cc.monash.edu.au',\n 'fred',\n 'ric',\n 'subject',\n 're',\n 'islam',\n 'dress',\n 'cod']"}, "execution_count": 12}], "source": "to_list(preprocess_texts(newsgroups, custom_pipeline=['tokenize', 'remove_punct', 'remove_stopwords', 'stem']))[0][:10]"}, {"cell_type": "markdown", "metadata": {"papermill": {"exception": false, "start_time": "2020-10-29T03:01:08.195870", "end_time": "2020-10-29T03:01:08.221866", "duration": 0.025996, "status": "completed"}, "tags": []}, "source": "## Lemmatize words"}, {"cell_type": "code", "execution_count": 13, "metadata": {"execution": {"iopub.execute_input": "2020-10-29T03:01:08.278865Z", "iopub.status.busy": "2020-10-29T03:01:08.277866Z", "iopub.status.idle": "2020-10-29T03:01:14.839973Z", "shell.execute_reply": "2020-10-29T03:01:14.840998Z"}, "papermill": {"exception": false, "start_time": "2020-10-29T03:01:08.246894", "end_time": "2020-10-29T03:01:14.840998", "duration": 6.594104, "status": "completed"}, "tags": []}, "outputs": [{"output_type": "execute_result", "metadata": {}, "data": {"text/plain": "['From',\n 'darice',\n 'yoyo.cc.monash.edu.au',\n 'Fred',\n 'Rice',\n 'Subject',\n 'Re',\n 'Islam',\n 'Dress',\n 'Code']"}, "execution_count": 13}], "source": "to_list(preprocess_texts(newsgroups, custom_pipeline=['tokenize', 'remove_punct', 'remove_stopwords', 'lemmatize']))[0][:10]"}, {"cell_type": "markdown", "metadata": {"papermill": {"exception": false, "start_time": "2020-10-29T03:01:14.867980", "end_time": "2020-10-29T03:01:14.894969", "duration": 0.026989, "status": "completed"}, "tags": []}, "source": "## Custom Function"}, {"cell_type": "code", "execution_count": 14, "metadata": {"execution": {"iopub.execute_input": "2020-10-29T03:01:14.977004Z", "iopub.status.busy": "2020-10-29T03:01:14.955004Z", "iopub.status.idle": "2020-10-29T03:01:17.439014Z", "shell.execute_reply": "2020-10-29T03:01:17.440015Z"}, "papermill": {"exception": false, "start_time": "2020-10-29T03:01:14.919970", "end_time": "2020-10-29T03:01:17.440015", "duration": 2.520045, "status": "completed"}, "tags": []}, "outputs": [{"output_type": "execute_result", "metadata": {}, "data": {"text/plain": "['FROM',\n ':',\n 'DARICE',\n '@',\n 'YOYO.CC.MONASH.EDU.AU',\n '(',\n 'FRED',\n 'RICE',\n ')',\n 'SUBJECT']"}, "execution_count": 14}], "source": "def shout(text_docs_bow):\n return ((word.upper() for word in doc) for doc in text_docs_bow)\n\nto_list(preprocess_texts(newsgroups, custom_pipeline=['tokenize', shout]))[0][:10]"}, {"cell_type": "markdown", "metadata": {"papermill": {"exception": false, "start_time": "2020-10-29T03:01:17.476015", "end_time": "2020-10-29T03:01:17.508015", "duration": 0.032, "status": "completed"}, "tags": []}, "source": "## Convert back to a single string"}, {"cell_type": "code", "execution_count": 15, "metadata": {"execution": {"iopub.execute_input": "2020-10-29T03:01:17.577041Z", "iopub.status.busy": "2020-10-29T03:01:17.575049Z", "iopub.status.idle": "2020-10-29T03:01:22.116117Z", "shell.execute_reply": "2020-10-29T03:01:22.115123Z"}, "papermill": {"exception": false, "start_time": "2020-10-29T03:01:17.540015", "end_time": "2020-10-29T03:01:22.116117", "duration": 4.576102, "status": "completed"}, "tags": []}, "outputs": [{"output_type": "execute_result", "metadata": {}, "data": {"text/plain": "\"From darice yoyo.cc.monash.edu.au Fred Rice Subject Re Islam Dress Code woman Organization Monash University Melb Australia Lines In . .rz.tu-bs.de .rz.tu-bs.de Benedikt Rosenau writes In article .. .cc.monash.edu.au darice yoyo.cc.monash.edu.au Fred Rice writes Deletion Of course people say think religion exactly coming different people within religion There nothing existing different perspective within religion perhaps one say tend converge truth My point lot harm way meantime And converge counterfactual religion appear split diverge Even might True Religion core layer determine happens practise quite inhumane usually What post supposed answer I n't see got I say I repeat Religions harm people And religion converge split Giving disagree upon And lot disagreement one tolerant one tolerant Ideologies also split giving disagree upon may also lead intolerance So also oppose ideology I n't think argument argument religion point weakness human nature Big deletion Do woman soul Islam Peo\""}, "execution_count": 15}], "source": "to_list(preprocess_texts(newsgroups, custom_pipeline=[\n 'tokenize', \n 'remove_punct', \n 'remove_stopwords', \n 'lemmatize',\n 'remove_digits',\n 'bag_of_words_to_docs'\n]))[0][:1000]"}, {"cell_type": "markdown", "metadata": {"papermill": {"exception": false, "start_time": "2020-10-29T03:01:22.144092", "end_time": "2020-10-29T03:01:22.173127", "duration": 0.029035, "status": "completed"}, "tags": []}, "source": "## Create a document-word frequency matrix"}, {"cell_type": "code", "execution_count": 16, "metadata": {"execution": {"iopub.execute_input": "2020-10-29T03:01:22.231090Z", "iopub.status.busy": "2020-10-29T03:01:22.230090Z", "iopub.status.idle": "2020-10-29T03:01:26.927102Z", "shell.execute_reply": "2020-10-29T03:01:26.927102Z"}, "papermill": {"exception": false, "start_time": "2020-10-29T03:01:22.199126", "end_time": "2020-10-29T03:01:26.927102", "duration": 4.727976, "status": "completed"}, "tags": []}, "outputs": [{"output_type": "execute_result", "metadata": {}, "data": {"text/plain": " _o aa aaa aah aap aario aaron abandoned abberation abc ... \\\n0 0 0 0 0 0 0 0 0 0 0 ... \n1 0 0 0 0 0 0 0 0 0 0 ... \n2 0 0 0 0 0 0 0 0 0 0 ... \n3 0 0 0 0 0 0 0 0 0 0 ... \n4 0 0 0 0 0 0 0 0 0 0 ... \n\n zlumber zombie zoo zues zumder zur zurlo zus zvonko zyklon \n0 0 0 0 0 0 0 0 0 0 0 \n1 0 0 0 0 0 0 0 0 0 0 \n2 0 0 0 0 0 0 0 0 0 0 \n3 0 0 0 0 0 0 0 0 0 0 \n4 0 0 0 0 0 0 0 0 0 0 \n\n[5 rows x 10039 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
_oaaaaaaahaapaarioaaronabandonedabberationabc...zlumberzombiezoozueszumderzurzurlozuszvonkozyklon
00000000000...0000000000
10000000000...0000000000
20000000000...0000000000
30000000000...0000000000
40000000000...0000000000
\n

5 rows \u00d7 10039 columns

\n
"}, "execution_count": 16}], "source": "newsgroups_docs = preprocess_texts(newsgroups, custom_pipeline=[\n 'tokenize', \n 'remove_punct', \n 'remove_stopwords', \n 'lemmatize',\n 'remove_digits',\n 'bag_of_words_to_docs'\n])\ncreate_doc_term_matrix(newsgroups_docs).iloc[:5, 10:]"}, {"cell_type": "markdown", "metadata": {"papermill": {"exception": false, "start_time": "2020-10-29T03:01:26.955074", "end_time": "2020-10-29T03:01:26.983102", "duration": 0.028028, "status": "completed"}, "tags": []}, "source": "## Create a TF-IDF matrix"}, {"cell_type": "code", "execution_count": 17, "metadata": {"execution": {"iopub.execute_input": "2020-10-29T03:01:27.044111Z", "iopub.status.busy": "2020-10-29T03:01:27.044111Z", "iopub.status.idle": "2020-10-29T03:01:32.965454Z", "shell.execute_reply": "2020-10-29T03:01:32.964456Z"}, "papermill": {"exception": false, "start_time": "2020-10-29T03:01:27.011104", "end_time": "2020-10-29T03:01:32.965454", "duration": 5.95435, "status": "completed"}, "tags": []}, "outputs": [{"output_type": "execute_result", "metadata": {}, "data": {"text/plain": " _o aa aaa aah aap aario aaron abandoned abberation abc ... \\\n0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... \n1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... \n2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... \n3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... \n4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... \n\n zlumber zombie zoo zues zumder zur zurlo zus zvonko zyklon \n0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n\n[5 rows x 10039 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
_oaaaaaaahaapaarioaaronabandonedabberationabc...zlumberzombiezoozueszumderzurzurlozuszvonkozyklon
00.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
10.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
20.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
30.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
40.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
\n

5 rows \u00d7 10039 columns

\n
"}, "execution_count": 17}], "source": "newsgroups_docs = preprocess_texts(newsgroups, custom_pipeline=[\n 'tokenize', \n 'remove_punct', \n 'remove_stopwords', \n 'lemmatize',\n 'remove_digits',\n 'bag_of_words_to_docs'\n])\ncreate_tfidf_matrix(newsgroups_docs).iloc[:5, 10:]"}, {"cell_type": "markdown", "metadata": {"papermill": {"exception": false, "start_time": "2020-10-29T03:01:32.994422", "end_time": "2020-10-29T03:01:33.023451", "duration": 0.029029, "status": "completed"}, "tags": []}, "source": "## Ngrams Frequency"}, {"cell_type": "code", "execution_count": 18, "metadata": {"execution": {"iopub.execute_input": "2020-10-29T03:01:33.083456Z", "iopub.status.busy": "2020-10-29T03:01:33.083456Z", "iopub.status.idle": "2020-10-29T03:01:37.303811Z", "shell.execute_reply": "2020-10-29T03:01:37.304836Z"}, "papermill": {"exception": false, "start_time": "2020-10-29T03:01:33.050448", "end_time": "2020-10-29T03:01:37.305524", "duration": 4.255076, "status": "completed"}, "tags": []}, "outputs": [{"output_type": "execute_result", "metadata": {}, "data": {"text/plain": "FreqDist({('Lines', ''): 492, ('Subject', 'Re'): 455, ('In', 'article'): 372, ('I', \"n't\"): 227, ('', 'In'): 204, ('I', 'think'): 169, ('article', ''): 151, ('article', '..'): 128, ('', ''): 126, ('writes', 'In'): 101, ...})"}, "execution_count": 18}], "source": "newsgroups_ngrams = preprocess_texts(newsgroups, custom_pipeline=[\n 'tokenize', \n 'remove_punct',\n 'remove_digits',\n 'remove_stopwords',\n 'ngram_freq'\n])\nnewsgroups_ngrams"}], "metadata": {"kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}, "language_info": {"name": "python", "version": "3.7.9", "mimetype": "text/x-python", "codemirror_mode": {"name": "ipython", "version": 3}, "pygments_lexer": "ipython3", "nbconvert_exporter": "python", "file_extension": ".py"}, "papermill": {"default_parameters": {}, "duration": 59.060682, "end_time": "2020-10-29T03:01:38.052812", "environment_variables": {}, "exception": null, "input_path": "C:\\workspace\\data-describe\\examples\\Text_Preprocessing.ipynb", "output_path": "C:\\workspace\\data-describe\\examples\\Text_Preprocessing.ipynb", "parameters": {}, "start_time": "2020-10-29T03:00:38.992130", "version": "2.2.0"}}, "nbformat": 4, "nbformat_minor": 4}