{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Text Preprocessing" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import itertools\n", "# Import the preprocessing API explicitly (avoids `import *` namespace pollution\n", "# and makes it clear where tokenize/to_list/preprocess_texts etc. come from).\n", "from data_describe.text.text_preprocessing import (\n", "    tokenize,\n", "    to_list,\n", "    to_lower,\n", "    preprocess_texts,\n", "    create_doc_term_matrix,\n", "    create_tfidf_matrix,\n", ")\n", "from data_describe.misc.load_data import load_data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load Data" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "from sklearn.datasets import fetch_20newsgroups\n", "categories = ['alt.atheism']\n", "newsgroups = fetch_20newsgroups(subset='train', categories=categories)['data']" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'From: darice@yoyo.cc.monash.edu.au (Fred Rice)\\nSubject: Re: Islam & Dress Code for women\\nOrganizatio'" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "newsgroups[0][:100]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Tokenize" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "newsgroups_tokens = tokenize(newsgroups)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['From',\n", " ':',\n", " 'darice',\n", " '@',\n", " 'yoyo.cc.monash.edu.au',\n", " '(',\n", " 'Fred',\n", " 'Rice',\n", " ')',\n", " 'Subject']" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "to_list(newsgroups_tokens)[0][:10]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Change to all lowercase" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['from',\n", " ':',\n", " 'darice',\n", " '@',\n", " 'yoyo.cc.monash.edu.au',\n", " '(',\n", " 'fred',\n", " 'rice',\n", " ')',\n", " 'subject']" ] }, "execution_count": 22, "metadata": {}, 
"output_type": "execute_result" } ], "source": [ "newsgroups_tokens = tokenize(newsgroups)\n", "newsgroups_lower = to_lower(newsgroups_tokens)\n", "to_list(newsgroups_lower)[0][:10]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Run a preprocessing pipeline in one line" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['from',\n", " ':',\n", " 'darice',\n", " '@',\n", " 'yoyo.cc.monash.edu.au',\n", " '(',\n", " 'fred',\n", " 'rice',\n", " ')',\n", " 'subject']" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "to_list(preprocess_texts(newsgroups, custom_pipeline=['tokenize', 'to_lower']))[0][:10]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Remove punctuation" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['From',\n", " 'darice',\n", " 'yoyo.cc.monash.edu.au',\n", " 'Fred',\n", " 'Rice',\n", " 'Subject',\n", " 'Re',\n", " 'Islam',\n", " 'Dress',\n", " 'Code']" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "to_list(preprocess_texts(newsgroups, custom_pipeline=['tokenize', 'remove_punct']))[0][:10]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Remove digits" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['this', 'is', '', 'a', 'test', '', 'if', 'it', 'works']" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "digits_test_list = [['this', 'is', '3', 'a', 'test', '2c', 'if', 'it', 'works']]\n", "to_list(preprocess_texts(digits_test_list, custom_pipeline=['remove_digits']))[0]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Remove single characters and spaces" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['this', 'is', 
'test']" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "single_char_spaces_test_list = [['this', 'is', ' ', 'a', 'test', ' ', 'b']]\n", "to_list(preprocess_texts(single_char_spaces_test_list, custom_pipeline=['remove_single_char_and_spaces']))[0]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Remove stopwords" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['From',\n", " 'darice',\n", " 'yoyo.cc.monash.edu.au',\n", " 'Fred',\n", " 'Rice',\n", " 'Subject',\n", " 'Re',\n", " 'Islam',\n", " 'Dress',\n", " 'Code']" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "to_list(preprocess_texts(newsgroups, custom_pipeline=['tokenize', 'remove_punct', 'remove_stopwords']))[0][:10]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Stem words" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['from',\n", " 'dar',\n", " 'yoyo.cc.monash.edu.au',\n", " 'fred',\n", " 'ric',\n", " 'subject',\n", " 're',\n", " 'islam',\n", " 'dress',\n", " 'cod']" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "to_list(preprocess_texts(newsgroups, custom_pipeline=['tokenize', 'remove_punct', 'remove_stopwords', 'stem']))[0][:10]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Lemmatize words" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['From',\n", " 'darice',\n", " 'yoyo.cc.monash.edu.au',\n", " 'Fred',\n", " 'Rice',\n", " 'Subject',\n", " 'Re',\n", " 'Islam',\n", " 'Dress',\n", " 'Code']" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "to_list(preprocess_texts(newsgroups, custom_pipeline=['tokenize', 'remove_punct', 'remove_stopwords', 'lemmatize']))[0][:10]" ] }, { "cell_type": 
"markdown", "metadata": {}, "source": [ "## Custom Function" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['FROM',\n", " ':',\n", " 'DARICE',\n", " '@',\n", " 'YOYO.CC.MONASH.EDU.AU',\n", " '(',\n", " 'FRED',\n", " 'RICE',\n", " ')',\n", " 'SUBJECT']" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def shout(text_docs_bow):\n", " return ((word.upper() for word in doc) for doc in text_docs_bow)\n", "\n", "to_list(preprocess_texts(newsgroups, custom_pipeline=['tokenize', shout]))[0][:10]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Convert back to a single string" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\"From darice yoyo.cc.monash.edu.au Fred Rice Subject Re Islam Dress Code woman Organization Monash University Melb Australia Lines In . .rz.tu-bs.de .rz.tu-bs.de Benedikt Rosenau writes In article .. 
.cc.monash.edu.au darice yoyo.cc.monash.edu.au Fred Rice writes Deletion Of course people say think religion exactly coming different people within religion There nothing existing different perspective within religion perhaps one say tend converge truth My point lot harm way meantime And converge counterfactual religion appear split diverge Even might True Religion core layer determine happens practise quite inhumane usually What post supposed answer I n't see got I say I repeat Religions harm people And religion converge split Giving disagree upon And lot disagreement one tolerant one tolerant Ideologies also split giving disagree upon may also lead intolerance So also oppose ideology I n't think argument argument religion point weakness human nature Big deletion Do woman soul Islam Peo\"" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "to_list(preprocess_texts(newsgroups, custom_pipeline=[\n", " 'tokenize', \n", " 'remove_punct', \n", " 'remove_stopwords', \n", " 'lemmatize',\n", " 'remove_digits',\n", " 'bag_of_words_to_docs'\n", "]))[0][:1000]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Create a document-word frequency matrix" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
_oaaaaaaahaapaarioaaronabandonedabberationabc...zlumberzombiezoozueszumderzurzurlozuszvonkozyklon
00000000000...0000000000
10000000000...0000000000
20000000000...0000000000
30000000000...0000000000
40000000000...0000000000
\n", "

5 rows × 10039 columns

\n", "
" ], "text/plain": [ " _o aa aaa aah aap aario aaron abandoned abberation abc ... \\\n", "0 0 0 0 0 0 0 0 0 0 0 ... \n", "1 0 0 0 0 0 0 0 0 0 0 ... \n", "2 0 0 0 0 0 0 0 0 0 0 ... \n", "3 0 0 0 0 0 0 0 0 0 0 ... \n", "4 0 0 0 0 0 0 0 0 0 0 ... \n", "\n", " zlumber zombie zoo zues zumder zur zurlo zus zvonko zyklon \n", "0 0 0 0 0 0 0 0 0 0 0 \n", "1 0 0 0 0 0 0 0 0 0 0 \n", "2 0 0 0 0 0 0 0 0 0 0 \n", "3 0 0 0 0 0 0 0 0 0 0 \n", "4 0 0 0 0 0 0 0 0 0 0 \n", "\n", "[5 rows x 10039 columns]" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "newsgroups_docs = preprocess_texts(newsgroups, custom_pipeline=[\n", " 'tokenize', \n", " 'remove_punct', \n", " 'remove_stopwords', \n", " 'lemmatize',\n", " 'remove_digits',\n", " 'bag_of_words_to_docs'\n", "])\n", "create_doc_term_matrix(newsgroups_docs).iloc[:5, 10:]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Create a TF-IDF matrix" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
_oaaaaaaahaapaarioaaronabandonedabberationabc...zlumberzombiezoozueszumderzurzurlozuszvonkozyklon
00.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
10.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
20.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
30.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
40.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
\n", "

5 rows × 10039 columns

\n", "
" ], "text/plain": [ " _o aa aaa aah aap aario aaron abandoned abberation abc ... \\\n", "0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... \n", "1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... \n", "2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... \n", "3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... \n", "4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... \n", "\n", " zlumber zombie zoo zues zumder zur zurlo zus zvonko zyklon \n", "0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "\n", "[5 rows x 10039 columns]" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "newsgroups_docs = preprocess_texts(newsgroups, custom_pipeline=[\n", " 'tokenize', \n", " 'remove_punct', \n", " 'remove_stopwords', \n", " 'lemmatize',\n", " 'remove_digits',\n", " 'bag_of_words_to_docs'\n", "])\n", "create_tfidf_matrix(newsgroups_docs).iloc[:5, 10:]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Ngrams Frequency" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "FreqDist({('Lines', ''): 492, ('Subject', 'Re'): 455, ('In', 'article'): 372, ('I', \"n't\"): 227, ('', 'In'): 204, ('I', 'think'): 169, ('article', ''): 151, ('article', '..'): 128, ('', ''): 126, ('writes', 'In'): 101, ...})" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "newsgroups_ngrams = preprocess_texts(newsgroups, custom_pipeline=[\n", " 'tokenize', \n", " 'remove_punct',\n", " 'remove_digits',\n", " 'remove_stopwords',\n", " 'ngram_freq'\n", "])\n", "newsgroups_ngrams" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": 
"text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.7" } }, "nbformat": 4, "nbformat_minor": 4 }