Text Preprocessing

[17]:
import pandas as pd
import itertools
from data_describe.text.text_preprocessing import *
from data_describe.misc.load_data import load_data

Load Data

[18]:
from sklearn.datasets import fetch_20newsgroups
categories = ['alt.atheism']
newsgroups = fetch_20newsgroups(subset='train', categories=categories)['data']
[19]:
newsgroups[0][:100]
[19]:
'From: darice@yoyo.cc.monash.edu.au (Fred Rice)\nSubject: Re: Islam & Dress Code for women\nOrganizatio'

Tokenize

[20]:
newsgroups_tokens = tokenize(newsgroups)
[21]:
to_list(newsgroups_tokens)[0][:10]
[21]:
['From',
 ':',
 'darice',
 '@',
 'yoyo.cc.monash.edu.au',
 '(',
 'Fred',
 'Rice',
 ')',
 'Subject']
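
Note that tokenize (like the other preprocessing functions here) appears to return a lazy generator rather than a list, which is why to_list is used to materialize the results. A generator is also exhausted after a single pass, so the cells below re-create it before each use. A minimal illustration, assuming generator semantics:

newsgroups_tokens = tokenize(newsgroups)
first_pass = to_list(newsgroups_tokens)  # consumes the generator
# newsgroups_tokens is now exhausted; call tokenize(newsgroups) again to reuse it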

Change to all lowercase

[22]:
newsgroups_tokens = tokenize(newsgroups)
newsgroups_lower = to_lower(newsgroups_tokens)
to_list(newsgroups_lower)[0][:10]
[22]:
['from',
 ':',
 'darice',
 '@',
 'yoyo.cc.monash.edu.au',
 '(',
 'fred',
 'rice',
 ')',
 'subject']

Run a preprocessing pipeline in one line

preprocess_texts chains the named steps from left to right; each string names one of the built-in steps, and plain callables can be mixed in as well (see Custom Function below).

[23]:
to_list(preprocess_texts(newsgroups, custom_pipeline=['tokenize', 'to_lower']))[0][:10]
[23]:
['from',
 ':',
 'darice',
 '@',
 'yoyo.cc.monash.edu.au',
 '(',
 'fred',
 'rice',
 ')',
 'subject']

Remove punctuation

[24]:
to_list(preprocess_texts(newsgroups, custom_pipeline=['tokenize', 'remove_punct']))[0][:10]
[24]:
['From',
 'darice',
 'yoyo.cc.monash.edu.au',
 'Fred',
 'Rice',
 'Subject',
 'Re',
 'Islam',
 'Dress',
 'Code']

Remove digits

[25]:
digits_test_list = [['this', 'is', '3', 'a', 'test', '2c', 'if', 'it', 'works']]
to_list(preprocess_texts(digits_test_list, custom_pipeline=['remove_digits']))[0]
[25]:
['this', 'is', '', 'a', 'test', '', 'if', 'it', 'works']
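
remove_digits blanks digit-bearing tokens to empty strings rather than dropping them. Since custom_pipeline also accepts plain callables (see Custom Function below), a hypothetical drop_empty step could filter out the leftovers; a minimal sketch:

def drop_empty(text_docs_bow):
    # Keep only non-empty tokens in each document.
    return ((word for word in doc if word) for doc in text_docs_bow)

to_list(preprocess_texts(digits_test_list, custom_pipeline=['remove_digits', drop_empty]))[0]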

Remove single characters and spaces

[26]:
single_char_spaces_test_list = [['this', 'is', '   ', 'a', 'test', '   ', 'b']]
to_list(preprocess_texts(single_char_spaces_test_list, custom_pipeline=['remove_single_char_and_spaces']))[0]
[26]:
['this', 'is', 'test']

Remove stopwords

[27]:
to_list(preprocess_texts(newsgroups, custom_pipeline=['tokenize', 'remove_punct', 'remove_stopwords']))[0][:10]
[27]:
['From',
 'darice',
 'yoyo.cc.monash.edu.au',
 'Fred',
 'Rice',
 'Subject',
 'Re',
 'Islam',
 'Dress',
 'Code']
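
Note that the capitalized 'From' survives even though 'from' is a typical stopword, which suggests the stopword match is case-sensitive. Running to_lower before remove_stopwords should catch it; a sketch (output not shown):

to_list(preprocess_texts(newsgroups, custom_pipeline=['tokenize', 'remove_punct', 'to_lower', 'remove_stopwords']))[0][:10]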

Stem words

[28]:
to_list(preprocess_texts(newsgroups, custom_pipeline=['tokenize', 'remove_punct', 'remove_stopwords', 'stem']))[0][:10]
[28]:
['from',
 'dar',
 'yoyo.cc.monash.edu.au',
 'fred',
 'ric',
 'subject',
 're',
 'islam',
 'dress',
 'cod']

Lemmatize words

Compare with the stemmer above, which clips aggressively ('darice' → 'dar', 'Rice' → 'ric'); the lemmatizer leaves surface forms intact unless it finds a dictionary lemma (e.g. 'women' → 'woman' further below).

[29]:
to_list(preprocess_texts(newsgroups, custom_pipeline=['tokenize', 'remove_punct', 'remove_stopwords', 'lemmatize']))[0][:10]
[29]:
['From',
 'darice',
 'yoyo.cc.monash.edu.au',
 'Fred',
 'Rice',
 'Subject',
 'Re',
 'Islam',
 'Dress',
 'Code']

Custom Function

[30]:
def shout(text_docs_bow):
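    # A custom step receives the corpus as an iterable of token iterables
    # and must return an object of the same shape.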
    return ((word.upper() for word in doc) for doc in text_docs_bow)

to_list(preprocess_texts(newsgroups, custom_pipeline=['tokenize', shout]))[0][:10]
[30]:
['FROM',
 ':',
 'DARICE',
 '@',
 'YOYO.CC.MONASH.EDU.AU',
 '(',
 'FRED',
 'RICE',
 ')',
 'SUBJECT']

Convert back to a single string

bag_of_words_to_docs joins each document's tokens back into one space-delimited string (note the doubled spaces where remove_digits blanked a token), which is the input form the matrix builders below expect.

[31]:
to_list(preprocess_texts(newsgroups, custom_pipeline=[
    'tokenize',
    'remove_punct',
    'remove_stopwords',
    'lemmatize',
    'remove_digits',
    'bag_of_words_to_docs'
]))[0][:1000]
[31]:
"From darice yoyo.cc.monash.edu.au Fred Rice Subject Re Islam Dress Code woman Organization Monash University Melb Australia Lines  In . .rz.tu-bs.de  .rz.tu-bs.de Benedikt Rosenau writes In article .. .cc.monash.edu.au darice yoyo.cc.monash.edu.au Fred Rice writes Deletion Of course people say think religion exactly coming different people within religion There nothing existing different perspective within religion perhaps one say tend converge truth My point lot harm way meantime And converge counterfactual religion appear split diverge Even might True Religion core layer determine happens practise quite inhumane usually What post supposed answer I n't see got I say I repeat Religions harm people And religion converge split Giving disagree upon And lot disagreement one tolerant one tolerant Ideologies also split giving disagree upon may also lead intolerance So also oppose ideology I n't think argument argument religion point weakness human nature Big deletion  Do woman soul Islam Peo"

Create a document-term frequency matrix

[32]:
newsgroups_docs = preprocess_texts(newsgroups, custom_pipeline=[
    'tokenize',
    'remove_punct',
    'remove_stopwords',
    'lemmatize',
    'remove_digits',
    'bag_of_words_to_docs'
])
create_doc_term_matrix(newsgroups_docs).iloc[:5, 10:]
[32]:
_o aa aaa aah aap aario aaron abandoned abberation abc ... zlumber zombie zoo zues zumder zur zurlo zus zvonko zyklon
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 10039 columns
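
The result is an ordinary pandas DataFrame, so the usual pandas operations apply. For example, the ten most frequent terms in the corpus (a sketch; the generator above was consumed by create_doc_term_matrix, so re-run the pipeline first):

newsgroups_docs = preprocess_texts(newsgroups, custom_pipeline=[
    'tokenize', 'remove_punct', 'remove_stopwords',
    'lemmatize', 'remove_digits', 'bag_of_words_to_docs'
])
dtm = create_doc_term_matrix(newsgroups_docs)
dtm.sum().sort_values(ascending=False).head(10)  # ten most frequent terms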

Create a TF-IDF matrix

[35]:
newsgroups_docs = preprocess_texts(newsgroups, custom_pipeline=[
    'tokenize',
    'remove_punct',
    'remove_stopwords',
    'lemmatize',
    'remove_digits',
    'bag_of_words_to_docs'
])
create_tfidf_matrix(newsgroups_docs).iloc[:5, 10:]
[35]:
_o aa aaa aah aap aario aaron abandoned abberation abc ... zlumber zombie zoo zues zumder zur zurlo zus zvonko zyklon
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 10039 columns
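
This too is a pandas DataFrame, so, for instance, the highest-weighted term in each document can be read off with idxmax; a sketch (again after re-running the preprocess_texts pipeline above, since the generator is consumed per call):

tfidf = create_tfidf_matrix(newsgroups_docs)
tfidf.idxmax(axis=1).head()  # top TF-IDF term in each document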

N-gram frequencies

[34]:
newsgroups_ngrams = preprocess_texts(newsgroups, custom_pipeline=[
    'tokenize',
    'remove_punct',
    'remove_digits',
    'remove_stopwords',
    'ngram_freq'
])
newsgroups_ngrams
[34]:
FreqDist({('Lines', ''): 492, ('Subject', 'Re'): 455, ('In', 'article'): 372, ('I', "n't"): 227, ('', 'In'): 204, ('I', 'think'): 169, ('article', ''): 151, ('article', '..'): 128, ('', ''): 126, ('writes', 'In'): 101, ...})
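
The counts come back as an NLTK FreqDist (per the repr above), so its standard API applies; note also that the empty-string tokens left by remove_digits show up in the bigrams. For example:

newsgroups_ngrams.most_common(5)  # the five most frequent bigrams
newsgroups_ngrams[('Subject', 'Re')]  # count for a specific bigram: 455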