Text Preprocessing

import pandas as pd
import itertools
from data_describe.text.text_preprocessing import *
from data_describe.misc.load_data import load_data

Load Data

from sklearn.datasets import fetch_20newsgroups
# Restrict the fetch to a single newsgroup category.
categories = ['alt.atheism']
# Fetch the training subset for those categories and keep only the 'data' entry
# (the recorded output below suggests each item is one raw post as a string).
newsgroups = fetch_20newsgroups(subset='train', categories=categories)['data']
'From: darice@yoyo.cc.monash.edu.au (Fred Rice)\nSubject: Re: Islam & Dress Code for women\nOrganizatio'


# Run the library's tokenize step on the raw posts.
# NOTE(review): presumably produces one sequence of tokens per post - confirm.
newsgroups_tokens = tokenize(newsgroups)

Change to all lowercase

# Tokenize first, then hand the tokenized posts to to_lower.
# NOTE(review): this repeats the tokenize call made earlier in the notebook;
# presumably harmless, but confirm whether the earlier result could be reused.
newsgroups_tokens = tokenize(newsgroups)
newsgroups_lower = to_lower(newsgroups_tokens)

Run a preprocessing pipeline in one line

# Chain the 'tokenize' and 'to_lower' steps by name in a single call, then
# show the first 10 items of the first processed document.
to_list(preprocess_texts(newsgroups, custom_pipeline=['tokenize', 'to_lower']))[0][:10]

Remove punctuation

# Run 'tokenize' followed by 'remove_punct', then show the first 10 items of
# the first processed document.
to_list(preprocess_texts(newsgroups, custom_pipeline=['tokenize', 'remove_punct']))[0][:10]

Remove digits

# One pre-tokenized document containing a pure-digit token ('3') and a mixed
# digit/letter token ('2c').
digits_test_list = [['this', 'is', '3', 'a', 'test', '2c', 'if', 'it', 'works']]
# Only 'remove_digits' is run; no 'tokenize' step is listed because the input
# is already a list of tokens. Per the recorded output below, both '3' and '2c'
# come back as empty strings rather than being dropped from the document.
to_list(preprocess_texts(digits_test_list, custom_pipeline=['remove_digits']))[0]
['this', 'is', '', 'a', 'test', '', 'if', 'it', 'works']

Remove single characters and spaces

# One pre-tokenized document containing whitespace-only tokens ('   ') and the
# single-character tokens 'a' and 'b'.
single_char_spaces_test_list = [['this', 'is', '   ', 'a', 'test', '   ', 'b']]
# Per the recorded output below, only 'this', 'is' and 'test' remain after the
# 'remove_single_char_and_spaces' step.
to_list(preprocess_texts(single_char_spaces_test_list, custom_pipeline=['remove_single_char_and_spaces']))[0]
['this', 'is', 'test']

Remove stopwords

# Run 'tokenize', 'remove_punct' and 'remove_stopwords' in that order, then
# show the first 10 items of the first processed document.
to_list(preprocess_texts(newsgroups, custom_pipeline=['tokenize', 'remove_punct', 'remove_stopwords']))[0][:10]

Stem words

# Run 'tokenize', 'remove_punct' and 'remove_stopwords', finish with the 'stem'
# step, then show the first 10 items of the first processed document.
to_list(preprocess_texts(newsgroups, custom_pipeline=['tokenize', 'remove_punct', 'remove_stopwords', 'stem']))[0][:10]

Lemmatize words

# Run 'tokenize', 'remove_punct' and 'remove_stopwords', finish with the
# 'lemmatize' step, then show the first 10 items of the first processed document.
to_list(preprocess_texts(newsgroups, custom_pipeline=['tokenize', 'remove_punct', 'remove_stopwords', 'lemmatize']))[0][:10]

Custom Function

def shout(text_docs_bow):
    """Lazily upper-case every word of every document.

    Args:
        text_docs_bow: Iterable of documents, each an iterable of objects
            exposing ``.upper()`` (e.g. ``str`` tokens).

    Returns:
        A generator that yields one generator per document; each inner
        generator yields the upper-cased words of that document in order.
        Nothing is computed until the generators are consumed.
    """
    def _upper_words(tokens):
        # Inner generator for a single document.
        return (token.upper() for token in tokens)

    return (_upper_words(tokens) for tokens in text_docs_bow)

# The pipeline list mixes a step name ('tokenize') with the user-defined
# callable `shout`, passed as a function object rather than a string; the first
# 10 items of the first processed document are then shown.
to_list(preprocess_texts(newsgroups, custom_pipeline=['tokenize', shout]))[0][:10]

Convert back to a single string

to_list(preprocess_texts(newsgroups, custom_pipeline=[
"From darice yoyo.cc.monash.edu.au Fred Rice Subject Re Islam Dress Code woman Organization Monash University Melb Australia Lines  In . .rz.tu-bs.de  .rz.tu-bs.de Benedikt Rosenau writes In article .. .cc.monash.edu.au darice yoyo.cc.monash.edu.au Fred Rice writes Deletion Of course people say think religion exactly coming different people within religion There nothing existing different perspective within religion perhaps one say tend converge truth My point lot harm way meantime And converge counterfactual religion appear split diverge Even might True Religion core layer determine happens practise quite inhumane usually What post supposed answer I n't see got I say I repeat Religions harm people And religion converge split Giving disagree upon And lot disagreement one tolerant one tolerant Ideologies also split giving disagree upon may also lead intolerance So also oppose ideology I n't think argument argument religion point weakness human nature Big deletion  Do woman soul Islam Peo"

Create a document-word frequency matrix

newsgroups_docs = preprocess_texts(newsgroups, custom_pipeline=[
# Build the document-term matrix from the preprocessed docs and display rows
# 0-4 only, leaving out the first 10 columns (.iloc[:5, 10:]).
create_doc_term_matrix(newsgroups_docs).iloc[:5, 10:]
_o aa aaa aah aap aario aaron abandoned abberation abc ... zlumber zombie zoo zues zumder zur zurlo zus zvonko zyklon
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 10039 columns

Create a TF-IDF matrix

newsgroups_docs = preprocess_texts(newsgroups, custom_pipeline=[
# Build the TF-IDF matrix from the preprocessed docs and display rows 0-4 only,
# leaving out the first 10 columns (.iloc[:5, 10:]).
create_tfidf_matrix(newsgroups_docs).iloc[:5, 10:]
_o aa aaa aah aap aario aaron abandoned abberation abc ... zlumber zombie zoo zues zumder zur zurlo zus zvonko zyklon
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 10039 columns

N-gram Frequency

newsgroups_ngrams = preprocess_texts(newsgroups, custom_pipeline=[
FreqDist({('Lines', ''): 492, ('Subject', 'Re'): 455, ('In', 'article'): 372, ('I', "n't"): 227, ('', 'In'): 204, ('I', 'think'): 169, ('article', ''): 151, ('article', '..'): 128, ('', ''): 126, ('writes', 'In'): 101, ...})