{ "cells": [ { "cell_type": "markdown", "metadata": { "papermill": { "duration": 0.010558, "end_time": "2020-11-11T19:35:45.556765", "exception": false, "start_time": "2020-11-11T19:35:45.546207", "status": "completed" }, "tags": [] }, "source": [ "# Topic Modeling" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "execution": { "iopub.execute_input": "2020-11-11T19:35:45.574431Z", "iopub.status.busy": "2020-11-11T19:35:45.573676Z", "iopub.status.idle": "2020-11-11T19:35:46.191686Z", "shell.execute_reply": "2020-11-11T19:35:46.192136Z" }, "papermill": { "duration": 0.628707, "end_time": "2020-11-11T19:35:46.192377", "exception": false, "start_time": "2020-11-11T19:35:45.563670", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "execution": { "iopub.execute_input": "2020-11-11T19:35:46.210066Z", "iopub.status.busy": "2020-11-11T19:35:46.209014Z", "iopub.status.idle": "2020-11-11T19:35:47.469660Z", "shell.execute_reply": "2020-11-11T19:35:47.470105Z" }, "papermill": { "duration": 1.270277, "end_time": "2020-11-11T19:35:47.470298", "exception": false, "start_time": "2020-11-11T19:35:46.200021", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "from data_describe.text.topic_modeling import topic_model" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "execution": { "iopub.execute_input": "2020-11-11T19:35:47.488273Z", "iopub.status.busy": "2020-11-11T19:35:47.487406Z", "iopub.status.idle": "2020-11-11T19:35:47.815155Z", "shell.execute_reply": "2020-11-11T19:35:47.815585Z" }, "papermill": { "duration": 0.340094, "end_time": "2020-11-11T19:35:47.815778", "exception": false, "start_time": "2020-11-11T19:35:47.475684", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "from sklearn.datasets import fetch_20newsgroups\n", "categories = ['alt.atheism']\n", "newsgroups_train = fetch_20newsgroups(subset='train', 
categories=categories)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "execution": { "iopub.execute_input": "2020-11-11T19:35:47.832912Z", "iopub.status.busy": "2020-11-11T19:35:47.831828Z", "iopub.status.idle": "2020-11-11T19:35:47.834585Z", "shell.execute_reply": "2020-11-11T19:35:47.835419Z" }, "papermill": { "duration": 0.014291, "end_time": "2020-11-11T19:35:47.835700", "exception": false, "start_time": "2020-11-11T19:35:47.821409", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "df = pd.DataFrame({\"text\": newsgroups_train['data']})" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "execution": { "iopub.execute_input": "2020-11-11T19:35:47.857449Z", "iopub.status.busy": "2020-11-11T19:35:47.856585Z", "iopub.status.idle": "2020-11-11T19:35:47.863031Z", "shell.execute_reply": "2020-11-11T19:35:47.863725Z" }, "papermill": { "duration": 0.020762, "end_time": "2020-11-11T19:35:47.863997", "exception": false, "start_time": "2020-11-11T19:35:47.843235", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
text
0From: darice@yoyo.cc.monash.edu.au (Fred Rice)...
1From: chrisb@tafe.sa.edu.au (Chris BELL)\\nSubj...
2Subject: Re: The Inimitable Rushdie\\nFrom: kma...
3From: timmbake@mcl.ucsb.edu (Bake Timmons)\\nSu...
4From: I3150101@dbstu1.rz.tu-bs.de (Benedikt Ro...
\n", "
" ], "text/plain": [ " text\n", "0 From: darice@yoyo.cc.monash.edu.au (Fred Rice)...\n", "1 From: chrisb@tafe.sa.edu.au (Chris BELL)\\nSubj...\n", "2 Subject: Re: The Inimitable Rushdie\\nFrom: kma...\n", "3 From: timmbake@mcl.ucsb.edu (Bake Timmons)\\nSu...\n", "4 From: I3150101@dbstu1.rz.tu-bs.de (Benedikt Ro..." ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "markdown", "metadata": { "papermill": { "duration": 0.011584, "end_time": "2020-11-11T19:35:47.887013", "exception": false, "start_time": "2020-11-11T19:35:47.875429", "status": "completed" }, "tags": [] }, "source": [ "## Explicitly providing the number of topics\n", "\n", "Pass `num_topics` to fit the model with a fixed number of topics." ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "execution": { "iopub.execute_input": "2020-11-11T19:35:47.910115Z", "iopub.status.busy": "2020-11-11T19:35:47.908989Z", "iopub.status.idle": "2020-11-11T19:35:49.848353Z", "shell.execute_reply": "2020-11-11T19:35:49.848764Z" }, "papermill": { "duration": 1.953953, "end_time": "2020-11-11T19:35:49.848963", "exception": false, "start_time": "2020-11-11T19:35:47.895010", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Topic 1Topic 1 Coefficient ValueTopic 2Topic 2 Coefficient Value
Term 1|>0.017|>0.010
Term 2>>0.013God0.007
Term 3:0.013-0.005
Term 4God0.005evidence0.004
Term 5-0.004:0.004
Term 6atheists0.004moral0.004
Term 7those0.004our0.004
Term 8|0.004these0.004
Term 9A0.003Jesus0.003
Term 10had0.003them0.003
\n", "
" ], "text/plain": [ " Topic 1 Topic 1 Coefficient Value Topic 2 \\\n", "Term 1 |> 0.017 |> \n", "Term 2 >> 0.013 God \n", "Term 3 : 0.013 - \n", "Term 4 God 0.005 evidence \n", "Term 5 - 0.004 : \n", "Term 6 atheists 0.004 moral \n", "Term 7 those 0.004 our \n", "Term 8 | 0.004 these \n", "Term 9 A 0.003 Jesus \n", "Term 10 had 0.003 them \n", "\n", " Topic 2 Coefficient Value \n", "Term 1 0.010 \n", "Term 2 0.007 \n", "Term 3 0.005 \n", "Term 4 0.004 \n", "Term 5 0.004 \n", "Term 6 0.004 \n", "Term 7 0.004 \n", "Term 8 0.004 \n", "Term 9 0.003 \n", "Term 10 0.003 " ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lda_model = topic_model(df.text, num_topics=2)\n", "lda_model" ] }, { "cell_type": "markdown", "metadata": { "papermill": { "duration": 0.007942, "end_time": "2020-11-11T19:35:49.863121", "exception": false, "start_time": "2020-11-11T19:35:49.855179", "status": "completed" }, "tags": [] }, "source": [ "## Guess the optimal number of topics and show the elbow plot\n", "\n", "With `num_topics=None`, `topic_model` chooses the number of topics itself from the range `min_topics` to `max_topics`." ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "execution": { "iopub.execute_input": "2020-11-11T19:35:49.964159Z", "iopub.status.busy": "2020-11-11T19:35:49.920117Z", "iopub.status.idle": "2020-11-11T19:35:54.628146Z", "shell.execute_reply": "2020-11-11T19:35:54.628543Z" }, "papermill": { "duration": 4.759244, "end_time": "2020-11-11T19:35:54.628726", "exception": false, "start_time": "2020-11-11T19:35:49.869482", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Fitting topic model: 100%|██████████| 3/3 [00:04<00:00, 1.48s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 9.64 s, sys: 260 ms, total: 9.9 s\n", "Wall time: 4.73 s\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Topic 1Topic 1 Coefficient ValueTopic 2Topic 2 Coefficient ValueTopic 3Topic 3 Coefficient ValueTopic 4Topic 4 Coefficient Value
Term 1>>0.019God0.009|>0.042:0.014
Term 2:0.009-0.006:0.010>>0.008
Term 3|>0.006argument0.005-0.005God0.006
Term 4-0.005evidence0.004God0.005them0.005
Term 5atheists0.005our0.004|0.005evidence0.004
Term 6God0.004moral0.004(Jon0.005Jesus0.004
Term 7had0.004these0.004Livesey)0.004our0.004
Term 8those0.004Islamic0.004moral0.004those0.004
Term 9its0.004had0.004livesey@solntze.wpd.sgi.com0.004these0.003
Term 10A0.004For0.004it.0.004read0.003
\n", "
" ], "text/plain": [ " Topic 1 Topic 1 Coefficient Value Topic 2 \\\n", "Term 1 >> 0.019 God \n", "Term 2 : 0.009 - \n", "Term 3 |> 0.006 argument \n", "Term 4 - 0.005 evidence \n", "Term 5 atheists 0.005 our \n", "Term 6 God 0.004 moral \n", "Term 7 had 0.004 these \n", "Term 8 those 0.004 Islamic \n", "Term 9 its 0.004 had \n", "Term 10 A 0.004 For \n", "\n", " Topic 2 Coefficient Value Topic 3 \\\n", "Term 1 0.009 |> \n", "Term 2 0.006 : \n", "Term 3 0.005 - \n", "Term 4 0.004 God \n", "Term 5 0.004 | \n", "Term 6 0.004 (Jon \n", "Term 7 0.004 Livesey) \n", "Term 8 0.004 moral \n", "Term 9 0.004 livesey@solntze.wpd.sgi.com \n", "Term 10 0.004 it. \n", "\n", " Topic 3 Coefficient Value Topic 4 Topic 4 Coefficient Value \n", "Term 1 0.042 : 0.014 \n", "Term 2 0.010 >> 0.008 \n", "Term 3 0.005 God 0.006 \n", "Term 4 0.005 them 0.005 \n", "Term 5 0.005 evidence 0.004 \n", "Term 6 0.005 Jesus 0.004 \n", "Term 7 0.004 our 0.004 \n", "Term 8 0.004 those 0.004 \n", "Term 9 0.004 these 0.003 \n", "Term 10 0.004 read 0.003 " ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "lda_model = topic_model(df.text, num_topics=None, min_topics=2, max_topics=4)\n", "lda_model" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "execution": { "iopub.execute_input": "2020-11-11T19:35:54.650188Z", "iopub.status.busy": "2020-11-11T19:35:54.649590Z", "iopub.status.idle": "2020-11-11T19:35:54.859040Z", "shell.execute_reply": "2020-11-11T19:35:54.859423Z" }, "papermill": { "duration": 0.222289, "end_time": "2020-11-11T19:35:54.859598", "exception": false, "start_time": "2020-11-11T19:35:54.637309", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" }, { "data": { "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "lda_model.elbow_plot()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.7" }, "papermill": { "default_parameters": {}, "duration": 11.01568, "end_time": "2020-11-11T19:35:55.384671", "environment_variables": {}, "exception": null, "input_path": "/Users/richardtruong-chau/Projects/data-describe/examples/Topic_Modeling.ipynb", "output_path": "/Users/richardtruong-chau/Projects/data-describe/examples/Topic_Modeling.ipynb", "parameters": {}, "start_time": "2020-11-11T19:35:44.368991", "version": "2.2.0" } }, "nbformat": 4, "nbformat_minor": 4 }