{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Data Summary" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [] }, "outputs": [], "source": [ "import datetime\n", "# Keep this after `import datetime`: it rebinds the name to the class, and later cells call datetime.strptime\n", "from datetime import datetime\n", "\n", "import pandas as pd\n", "from sklearn.datasets import load_boston\n", "\n", "from data_describe import data_summary" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CRIMZNINDUSCHASNOXRMAGEDISRADTAXPTRATIOBLSTATtarget
00.0063218.02.310.00.5386.57565.24.091.0296.015.3396.94.9824.0
\n", "
" ], "text/plain": [ " CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO \\\n", "0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.09 1.0 296.0 15.3 \n", "\n", " B LSTAT target \n", "0 396.9 4.98 24.0 " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.datasets import load_boston\n", "data = load_boston()\n", "df = pd.DataFrame(data.data, columns=list(data.feature_names))\n", "df['target'] = data.target\n", "df.head(1)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Change data types to demonstrate data summary\n", "df['AGE'] = df['AGE'].map(lambda x: \"young\" if x < 29 else \"old\")\n", "df[\"AgeFlag\"] = df['AGE'].astype(bool)\n", "df['ZN'] = df['ZN'].astype(int)\n", "df['Date'] = datetime.strptime('1/1/2008 1:30 PM', '%m/%d/%Y %I:%M %p')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CRIMZNINDUSCHASNOXRMAGEDISRADTAXPTRATIOBLSTATtargetAgeFlagDate
Data Typefloat64int32float64float64float64float64objectfloat64float64float64float64float64float64float64booldatetime64[ns]
Mean3.6135211.347811.13680.069170.5546956.284633.795049.54941408.23718.4555356.67412.653122.532812008-01-01 13:30:00
Standard Deviation8.6015523.31066.860350.2539940.1158780.7026172.105718.70726168.5372.1649591.29497.141069.19710
Median0.2565109.6900.5386.20853.20745533019.05391.4411.3621.21
Min0.0063200.4600.3853.5611.1296118712.60.321.735
Max88.976210027.7410.8718.7812.12652471122396.937.9750
# Zeros03720471000000000
# Nulls0000000000000000
% Most Frequent Value0.473.5226.0993.084.550.5988.140.9926.0926.0927.6723.910.593.16100100
\n", "
" ], "text/plain": [ " CRIM ZN INDUS CHAS NOX \\\n", "Data Type float64 int32 float64 float64 float64 \n", "Mean 3.61352 11.3478 11.1368 0.06917 0.554695 \n", "Standard Deviation 8.60155 23.3106 6.86035 0.253994 0.115878 \n", "Median 0.25651 0 9.69 0 0.538 \n", "Min 0.00632 0 0.46 0 0.385 \n", "Max 88.9762 100 27.74 1 0.871 \n", "# Zeros 0 372 0 471 0 \n", "# Nulls 0 0 0 0 0 \n", "% Most Frequent Value 0.4 73.52 26.09 93.08 4.55 \n", "\n", " RM AGE DIS RAD TAX PTRATIO \\\n", "Data Type float64 object float64 float64 float64 float64 \n", "Mean 6.28463 3.79504 9.54941 408.237 18.4555 \n", "Standard Deviation 0.702617 2.10571 8.70726 168.537 2.16495 \n", "Median 6.2085 3.20745 5 330 19.05 \n", "Min 3.561 1.1296 1 187 12.6 \n", "Max 8.78 12.1265 24 711 22 \n", "# Zeros 0 0 0 0 0 \n", "# Nulls 0 0 0 0 0 0 \n", "% Most Frequent Value 0.59 88.14 0.99 26.09 26.09 27.67 \n", "\n", " B LSTAT target AgeFlag Date \n", "Data Type float64 float64 float64 bool datetime64[ns] \n", "Mean 356.674 12.6531 22.5328 1 2008-01-01 13:30:00 \n", "Standard Deviation 91.2949 7.14106 9.1971 0 \n", "Median 391.44 11.36 21.2 1 \n", "Min 0.32 1.73 5 \n", "Max 396.9 37.97 50 \n", "# Zeros 0 0 0 \n", "# Nulls 0 0 0 0 0 \n", "% Most Frequent Value 23.91 0.59 3.16 100 100 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_summary(df)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.7" } }, "nbformat": 4, "nbformat_minor": 4 }