{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Topic Modeling\n", "\n", "Demonstrates `topic_model` from `data_describe` on the `alt.atheism` posts of the 20 newsgroups training set: first with an explicit number of topics, then with `num_topics=None`, followed by an elbow plot." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from sklearn.datasets import fetch_20newsgroups\n", "\n", "from data_describe.text.topic_modeling import topic_model" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Newsgroup categories to load.\n", "categories = ['alt.atheism']" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Load the training split for the selected categories.\n", "newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# One row per post, raw text in the `text` column.\n", "df = pd.DataFrame({\"text\": newsgroups_train['data']})" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
text
0From: darice@yoyo.cc.monash.edu.au (Fred Rice)...
1From: chrisb@tafe.sa.edu.au (Chris BELL)\\nSubj...
2Subject: Re: The Inimitable Rushdie\\nFrom: kma...
3From: timmbake@mcl.ucsb.edu (Bake Timmons)\\nSu...
4From: I3150101@dbstu1.rz.tu-bs.de (Benedikt Ro...
\n", "
" ], "text/plain": [ " text\n", "0 From: darice@yoyo.cc.monash.edu.au (Fred Rice)...\n", "1 From: chrisb@tafe.sa.edu.au (Chris BELL)\\nSubj...\n", "2 Subject: Re: The Inimitable Rushdie\\nFrom: kma...\n", "3 From: timmbake@mcl.ucsb.edu (Bake Timmons)\\nSu...\n", "4 From: I3150101@dbstu1.rz.tu-bs.de (Benedikt Ro..." ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Explicitly providing number of topics" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Topic 1Topic 1 Coefficient ValueTopic 2Topic 2 Coefficient Value
Term 1|>0.025>>0.014
Term 2:0.009:0.010
Term 3-0.005God0.006
Term 4God0.005|0.004
Term 5much0.004them0.004
Term 6those0.004had0.004
Term 7way0.003|>0.004
Term 8time0.003-0.004
Term 9atheists0.003these0.004
Term 10may0.003atheists0.004
\n", "
" ], "text/plain": [ " Topic 1 Topic 1 Coefficient Value Topic 2 \\\n", "Term 1 |> 0.025 >> \n", "Term 2 : 0.009 : \n", "Term 3 - 0.005 God \n", "Term 4 God 0.005 | \n", "Term 5 much 0.004 them \n", "Term 6 those 0.004 had \n", "Term 7 way 0.003 |> \n", "Term 8 time 0.003 - \n", "Term 9 atheists 0.003 these \n", "Term 10 may 0.003 atheists \n", "\n", " Topic 2 Coefficient Value \n", "Term 1 0.014 \n", "Term 2 0.010 \n", "Term 3 0.006 \n", "Term 4 0.004 \n", "Term 5 0.004 \n", "Term 6 0.004 \n", "Term 7 0.004 \n", "Term 8 0.004 \n", "Term 9 0.004 \n", "Term 10 0.004 " ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lda_model = topic_model(df.text, num_topics=2)\n", "lda_model" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Guess optimal number of topics and show elbow plot" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Topic 1Topic 1 Coefficient ValueTopic 2Topic 2 Coefficient ValueTopic 3Topic 3 Coefficient ValueTopic 4Topic 4 Coefficient ValueTopic 5Topic 5 Coefficient ValueTopic 6Topic 6 Coefficient ValueTopic 7Topic 7 Coefficient ValueTopic 8Topic 8 Coefficient Value
Term 1>>0.008God0.010:0.031>>0.013|>0.021>>0.010|>0.071>>0.026
Term 2had0.006|0.008-0.007God0.008system0.006evidence0.005:0.016Jesus0.007
Term 3|0.005-0.007God0.006atheists0.005Schneider)0.006am0.005>>0.007God0.006
Term 4-0.005atheists0.005also0.005those0.005keith@cco.caltech.edu0.006objective0.004Livesey)0.007them0.006
Term 5God0.005those0.005Islamic0.005moral0.005am0.005world0.004(Jon0.006things0.006
Term 6evidence0.004For0.005our0.004religious0.005Allan0.005read0.004livesey@solntze.wpd.sgi.com0.006atheists0.005
Term 7time0.004Islam0.004A0.004-0.004Institute0.004take0.004God0.005evidence0.004
Term 8it.0.004religion0.004may0.004our0.004objective0.004|>0.004them0.004it.0.004
Term 9We0.004>>0.004Islam0.004way0.004these0.004almost0.004moral0.004-0.004
Term 10|>0.004way0.004much0.003may0.004keith0.004atheists0.003A0.004They0.004
\n", "
" ], "text/plain": [ " Topic 1 Topic 1 Coefficient Value Topic 2 \\\n", "Term 1 >> 0.008 God \n", "Term 2 had 0.006 | \n", "Term 3 | 0.005 - \n", "Term 4 - 0.005 atheists \n", "Term 5 God 0.005 those \n", "Term 6 evidence 0.004 For \n", "Term 7 time 0.004 Islam \n", "Term 8 it. 0.004 religion \n", "Term 9 We 0.004 >> \n", "Term 10 |> 0.004 way \n", "\n", " Topic 2 Coefficient Value Topic 3 Topic 3 Coefficient Value \\\n", "Term 1 0.010 : 0.031 \n", "Term 2 0.008 - 0.007 \n", "Term 3 0.007 God 0.006 \n", "Term 4 0.005 also 0.005 \n", "Term 5 0.005 Islamic 0.005 \n", "Term 6 0.005 our 0.004 \n", "Term 7 0.004 A 0.004 \n", "Term 8 0.004 may 0.004 \n", "Term 9 0.004 Islam 0.004 \n", "Term 10 0.004 much 0.003 \n", "\n", " Topic 4 Topic 4 Coefficient Value Topic 5 \\\n", "Term 1 >> 0.013 |> \n", "Term 2 God 0.008 system \n", "Term 3 atheists 0.005 Schneider) \n", "Term 4 those 0.005 keith@cco.caltech.edu \n", "Term 5 moral 0.005 am \n", "Term 6 religious 0.005 Allan \n", "Term 7 - 0.004 Institute \n", "Term 8 our 0.004 objective \n", "Term 9 way 0.004 these \n", "Term 10 may 0.004 keith \n", "\n", " Topic 5 Coefficient Value Topic 6 Topic 6 Coefficient Value \\\n", "Term 1 0.021 >> 0.010 \n", "Term 2 0.006 evidence 0.005 \n", "Term 3 0.006 am 0.005 \n", "Term 4 0.006 objective 0.004 \n", "Term 5 0.005 world 0.004 \n", "Term 6 0.005 read 0.004 \n", "Term 7 0.004 take 0.004 \n", "Term 8 0.004 |> 0.004 \n", "Term 9 0.004 almost 0.004 \n", "Term 10 0.004 atheists 0.003 \n", "\n", " Topic 7 Topic 7 Coefficient Value Topic 8 \\\n", "Term 1 |> 0.071 >> \n", "Term 2 : 0.016 Jesus \n", "Term 3 >> 0.007 God \n", "Term 4 Livesey) 0.007 them \n", "Term 5 (Jon 0.006 things \n", "Term 6 livesey@solntze.wpd.sgi.com 0.006 atheists \n", "Term 7 God 0.005 evidence \n", "Term 8 them 0.004 it. 
\n", "Term 9 moral 0.004 - \n", "Term 10 A 0.004 They \n", "\n", " Topic 8 Coefficient Value \n", "Term 1 0.026 \n", "Term 2 0.007 \n", "Term 3 0.006 \n", "Term 4 0.006 \n", "Term 5 0.006 \n", "Term 6 0.005 \n", "Term 7 0.004 \n", "Term 8 0.004 \n", "Term 9 0.004 \n", "Term 10 0.004 " ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lda_model = topic_model(df.text, num_topics=None)\n", "lda_model" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n", "DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated since Python 3.3,and in 3.9 it will stop working\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" }, { "data": { "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "lda_model.elbow_plot()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.7" } }, "nbformat": 4, "nbformat_minor": 4 }