Text Preprocessing

[17]:
import pandas as pd
import itertools
from data_describe.text.text_preprocessing import *
from data_describe.misc.load_data import load_data

Load Data

[18]:
from sklearn.datasets import fetch_20newsgroups
categories = ['alt.atheism']
newsgroups = fetch_20newsgroups(subset='train', categories=categories)['data']
[19]:
newsgroups[0][:100]
[19]:
'From: darice@yoyo.cc.monash.edu.au (Fred Rice)\nSubject: Re: Islam & Dress Code for women\nOrganizatio'

Tokenize

[20]:
newsgroups_tokens = tokenize(newsgroups)
[21]:
to_list(newsgroups_tokens)[0][:10]
[21]:
['From',
 ':',
 'darice',
 '@',
 'yoyo.cc.monash.edu.au',
 '(',
 'Fred',
 'Rice',
 ')',
 'Subject']
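
Note that tokenize (like the other preprocessing functions here) appears to return a lazy generator rather than a list, which is why to_list is used to materialize the results. A generator is also exhausted after a single pass, so the cells below re-create it before each use. A minimal illustration, assuming generator semantics:

newsgroups_tokens = tokenize(newsgroups)
first_pass = to_list(newsgroups_tokens)  # consumes the generator
# newsgroups_tokens is now exhausted; call tokenize(newsgroups) again to reuse it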

Change to all lowercase

[22]:
newsgroups_tokens = tokenize(newsgroups)
newsgroups_lower = to_lower(newsgroups_tokens)
to_list(newsgroups_lower)[0][:10]
[22]:
['from',
 ':',
 'darice',
 '@',
 'yoyo.cc.monash.edu.au',
 '(',
 'fred',
 'rice',
 ')',
 'subject']

Run a preprocessing pipeline in one line

preprocess_texts chains the named steps from left to right; each string names one of the built-in steps, and plain callables can be mixed in as well (see Custom Function below).

[23]:
to_list(preprocess_texts(newsgroups, custom_pipeline=['tokenize', 'to_lower']))[0][:10]
[23]:
['from',
 ':',
 'darice',
 '@',
 'yoyo.cc.monash.edu.au',
 '(',
 'fred',
 'rice',
 ')',
 'subject']

Remove punctuation

[24]:
to_list(preprocess_texts(newsgroups, custom_pipeline=['tokenize', 'remove_punct']))[0][:10]
[24]:
['From',
 'darice',
 'yoyo.cc.monash.edu.au',
 'Fred',
 'Rice',
 'Subject',
 'Re',
 'Islam',
 'Dress',
 'Code']

Remove digits

[25]:
digits_test_list = [['this', 'is', '3', 'a', 'test', '2c', 'if', 'it', 'works']]
to_list(preprocess_texts(digits_test_list, custom_pipeline=['remove_digits']))[0]
[25]:
['this', 'is', '', 'a', 'test', '', 'if', 'it', 'works']
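
remove_digits blanks digit-bearing tokens to empty strings rather than dropping them. Since custom_pipeline also accepts plain callables (see Custom Function below), a hypothetical drop_empty step could filter out the leftovers; a minimal sketch:

def drop_empty(text_docs_bow):
    # Keep only non-empty tokens in each document.
    return ((word for word in doc if word) for doc in text_docs_bow)

to_list(preprocess_texts(digits_test_list, custom_pipeline=['remove_digits', drop_empty]))[0]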

Remove single characters and spaces

[26]:
single_char_spaces_test_list = [['this', 'is', '   ', 'a', 'test', '   ', 'b']]
to_list(preprocess_texts(single_char_spaces_test_list, custom_pipeline=['remove_single_char_and_spaces']))[0]
[26]:
['this', 'is', 'test']

Remove stopwords

[27]:
to_list(preprocess_texts(newsgroups, custom_pipeline=['tokenize', 'remove_punct', 'remove_stopwords']))[0][:10]
[27]:
['From',
 'darice',
 'yoyo.cc.monash.edu.au',
 'Fred',
 'Rice',
 'Subject',
 'Re',
 'Islam',
 'Dress',
 'Code']
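
Note that the capitalized 'From' survives even though 'from' is a typical stopword, which suggests the stopword match is case-sensitive. Running to_lower before remove_stopwords should catch it; a sketch (output not shown):

to_list(preprocess_texts(newsgroups, custom_pipeline=['tokenize', 'remove_punct', 'to_lower', 'remove_stopwords']))[0][:10]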

Stem words

[28]:
to_list(preprocess_texts(newsgroups, custom_pipeline=['tokenize', 'remove_punct', 'remove_stopwords', 'stem']))[0][:10]
[28]:
['from',
 'dar',
 'yoyo.cc.monash.edu.au',
 'fred',
 'ric',
 'subject',
 're',
 'islam',
 'dress',
 'cod']

Lemmatize words

Compare with the stemmer above, which clips aggressively ('darice' → 'dar', 'Rice' → 'ric'); the lemmatizer leaves surface forms intact unless it finds a dictionary lemma (e.g. 'women' → 'woman' further below).

[29]:
to_list(preprocess_texts(newsgroups, custom_pipeline=['tokenize', 'remove_punct', 'remove_stopwords', 'lemmatize']))[0][:10]
[29]:
['From',
 'darice',
 'yoyo.cc.monash.edu.au',
 'Fred',
 'Rice',
 'Subject',
 'Re',
 'Islam',
 'Dress',
 'Code']

Custom Function

[30]:
def shout(text_docs_bow):
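    # A custom step receives the corpus as an iterable of token iterables
    # and must return an object of the same shape.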
    return ((word.upper() for word in doc) for doc in text_docs_bow)

to_list(preprocess_texts(newsgroups, custom_pipeline=['tokenize', shout]))[0][:10]
[30]:
['FROM',
 ':',
 'DARICE',
 '@',
 'YOYO.CC.MONASH.EDU.AU',
 '(',
 'FRED',
 'RICE',
 ')',
 'SUBJECT']

Convert back to a single string

bag_of_words_to_docs joins each document's tokens back into one space-delimited string (note the doubled spaces where remove_digits blanked a token), which is the input form the matrix builders below expect.

[31]:
to_list(preprocess_texts(newsgroups, custom_pipeline=[
    'tokenize',
    'remove_punct',
    'remove_stopwords',
    'lemmatize',
    'remove_digits',
    'bag_of_words_to_docs'
]))[0][:1000]
[31]:
"From darice yoyo.cc.monash.edu.au Fred Rice Subject Re Islam Dress Code woman Organization Monash University Melb Australia Lines  In . .rz.tu-bs.de  .rz.tu-bs.de Benedikt Rosenau writes In article .. .cc.monash.edu.au darice yoyo.cc.monash.edu.au Fred Rice writes Deletion Of course people say think religion exactly coming different people within religion There nothing existing different perspective within religion perhaps one say tend converge truth My point lot harm way meantime And converge counterfactual religion appear split diverge Even might True Religion core layer determine happens practise quite inhumane usually What post supposed answer I n't see got I say I repeat Religions harm people And religion converge split Giving disagree upon And lot disagreement one tolerant one tolerant Ideologies also split giving disagree upon may also lead intolerance So also oppose ideology I n't think argument argument religion point weakness human nature Big deletion  Do woman soul Islam Peo"

Create a document-term frequency matrix

[32]:
newsgroups_docs = preprocess_texts(newsgroups, custom_pipeline=[
    'tokenize',
    'remove_punct',
    'remove_stopwords',
    'lemmatize',
    'remove_digits',
    'bag_of_words_to_docs'
])
create_doc_term_matrix(newsgroups_docs).iloc[:5, 10:]
[32]:
_o aa aaa aah aap aario aaron abandoned abberation abc ... zlumber zombie zoo zues zumder zur zurlo zus zvonko zyklon
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 10039 columns
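
The result is an ordinary pandas DataFrame, so the usual pandas operations apply. For example, the ten most frequent terms in the corpus (a sketch; the generator above was consumed by create_doc_term_matrix, so re-run the pipeline first):

newsgroups_docs = preprocess_texts(newsgroups, custom_pipeline=[
    'tokenize', 'remove_punct', 'remove_stopwords',
    'lemmatize', 'remove_digits', 'bag_of_words_to_docs'
])
dtm = create_doc_term_matrix(newsgroups_docs)
dtm.sum().sort_values(ascending=False).head(10)  # ten most frequent terms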

Create a TF-IDF matrix

[35]:
newsgroups_docs = preprocess_texts(newsgroups, custom_pipeline=[
    'tokenize',
    'remove_punct',
    'remove_stopwords',
    'lemmatize',
    'remove_digits',
    'bag_of_words_to_docs'
])
create_tfidf_matrix(newsgroups_docs).iloc[:5, 10:]
[35]:
_o aa aaa aah aap aario aaron abandoned abberation abc ... zlumber zombie zoo zues zumder zur zurlo zus zvonko zyklon
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 10039 columns
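
This too is a pandas DataFrame, so, for instance, the highest-weighted term in each document can be read off with idxmax; a sketch (again after re-running the preprocess_texts pipeline above, since the generator is consumed per call):

tfidf = create_tfidf_matrix(newsgroups_docs)
tfidf.idxmax(axis=1).head()  # top TF-IDF term in each document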

N-gram frequencies

[34]:
newsgroups_ngrams = preprocess_texts(newsgroups, custom_pipeline=[
    'tokenize',
    'remove_punct',
    'remove_digits',
    'remove_stopwords',
    'ngram_freq'
])
newsgroups_ngrams
[34]:
FreqDist({('Lines', ''): 492, ('Subject', 'Re'): 455, ('In', 'article'): 372, ('I', "n't"): 227, ('', 'In'): 204, ('I', 'think'): 169, ('article', ''): 151, ('article', '..'): 128, ('', ''): 126, ('writes', 'In'): 101, ...})
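
The counts come back as an NLTK FreqDist (per the repr above), so its standard API applies; note also that the empty-string tokens left by remove_digits show up in the bigrams. For example:

newsgroups_ngrams.most_common(5)  # the five most frequent bigrams
newsgroups_ngrams[('Subject', 'Re')]  # count for a specific bigram: 455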