{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data Summary"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import datetime\n",
"from data_describe import data_summary\n",
"from datetime import datetime"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" CRIM | \n",
" ZN | \n",
" INDUS | \n",
" CHAS | \n",
" NOX | \n",
" RM | \n",
" AGE | \n",
" DIS | \n",
" RAD | \n",
" TAX | \n",
" PTRATIO | \n",
" B | \n",
" LSTAT | \n",
" target | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0.00632 | \n",
" 18.0 | \n",
" 2.31 | \n",
" 0.0 | \n",
" 0.538 | \n",
" 6.575 | \n",
" 65.2 | \n",
" 4.09 | \n",
" 1.0 | \n",
" 296.0 | \n",
" 15.3 | \n",
" 396.9 | \n",
" 4.98 | \n",
" 24.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO \\\n",
"0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.09 1.0 296.0 15.3 \n",
"\n",
" B LSTAT target \n",
"0 396.9 4.98 24.0 "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.datasets import load_boston\n",
"data = load_boston()\n",
"df = pd.DataFrame(data.data, columns=list(data.feature_names))\n",
"df['target'] = data.target\n",
"df.head(1)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Change data types to demonstrate data summary\n",
"df['AGE'] = df['AGE'].map(lambda x: \"young\" if x < 29 else \"old\")\n",
"df[\"AgeFlag\"] = df['AGE'].astype(bool)\n",
"df['ZN'] = df['ZN'].astype(int)\n",
"df['Date'] = datetime.strptime('1/1/2008 1:30 PM', '%m/%d/%Y %I:%M %p')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" CRIM | \n",
" ZN | \n",
" INDUS | \n",
" CHAS | \n",
" NOX | \n",
" RM | \n",
" AGE | \n",
" DIS | \n",
" RAD | \n",
" TAX | \n",
" PTRATIO | \n",
" B | \n",
" LSTAT | \n",
" target | \n",
" AgeFlag | \n",
" Date | \n",
"
\n",
" \n",
" \n",
" \n",
" Data Type | \n",
" float64 | \n",
" int32 | \n",
" float64 | \n",
" float64 | \n",
" float64 | \n",
" float64 | \n",
" object | \n",
" float64 | \n",
" float64 | \n",
" float64 | \n",
" float64 | \n",
" float64 | \n",
" float64 | \n",
" float64 | \n",
" bool | \n",
" datetime64[ns] | \n",
"
\n",
" \n",
" Mean | \n",
" 3.61352 | \n",
" 11.3478 | \n",
" 11.1368 | \n",
" 0.06917 | \n",
" 0.554695 | \n",
" 6.28463 | \n",
" | \n",
" 3.79504 | \n",
" 9.54941 | \n",
" 408.237 | \n",
" 18.4555 | \n",
" 356.674 | \n",
" 12.6531 | \n",
" 22.5328 | \n",
" 1 | \n",
" 2008-01-01 13:30:00 | \n",
"
\n",
" \n",
" Standard Deviation | \n",
" 8.60155 | \n",
" 23.3106 | \n",
" 6.86035 | \n",
" 0.253994 | \n",
" 0.115878 | \n",
" 0.702617 | \n",
" | \n",
" 2.10571 | \n",
" 8.70726 | \n",
" 168.537 | \n",
" 2.16495 | \n",
" 91.2949 | \n",
" 7.14106 | \n",
" 9.1971 | \n",
" 0 | \n",
" | \n",
"
\n",
" \n",
" Median | \n",
" 0.25651 | \n",
" 0 | \n",
" 9.69 | \n",
" 0 | \n",
" 0.538 | \n",
" 6.2085 | \n",
" | \n",
" 3.20745 | \n",
" 5 | \n",
" 330 | \n",
" 19.05 | \n",
" 391.44 | \n",
" 11.36 | \n",
" 21.2 | \n",
" 1 | \n",
" | \n",
"
\n",
" \n",
" Min | \n",
" 0.00632 | \n",
" 0 | \n",
" 0.46 | \n",
" 0 | \n",
" 0.385 | \n",
" 3.561 | \n",
" | \n",
" 1.1296 | \n",
" 1 | \n",
" 187 | \n",
" 12.6 | \n",
" 0.32 | \n",
" 1.73 | \n",
" 5 | \n",
" | \n",
" | \n",
"
\n",
" \n",
" Max | \n",
" 88.9762 | \n",
" 100 | \n",
" 27.74 | \n",
" 1 | \n",
" 0.871 | \n",
" 8.78 | \n",
" | \n",
" 12.1265 | \n",
" 24 | \n",
" 711 | \n",
" 22 | \n",
" 396.9 | \n",
" 37.97 | \n",
" 50 | \n",
" | \n",
" | \n",
"
\n",
" \n",
" # Zeros | \n",
" 0 | \n",
" 372 | \n",
" 0 | \n",
" 471 | \n",
" 0 | \n",
" 0 | \n",
" | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" | \n",
" | \n",
"
\n",
" \n",
" # Nulls | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" % Most Frequent Value | \n",
" 0.4 | \n",
" 73.52 | \n",
" 26.09 | \n",
" 93.08 | \n",
" 4.55 | \n",
" 0.59 | \n",
" 88.14 | \n",
" 0.99 | \n",
" 26.09 | \n",
" 26.09 | \n",
" 27.67 | \n",
" 23.91 | \n",
" 0.59 | \n",
" 3.16 | \n",
" 100 | \n",
" 100 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" CRIM ZN INDUS CHAS NOX \\\n",
"Data Type float64 int32 float64 float64 float64 \n",
"Mean 3.61352 11.3478 11.1368 0.06917 0.554695 \n",
"Standard Deviation 8.60155 23.3106 6.86035 0.253994 0.115878 \n",
"Median 0.25651 0 9.69 0 0.538 \n",
"Min 0.00632 0 0.46 0 0.385 \n",
"Max 88.9762 100 27.74 1 0.871 \n",
"# Zeros 0 372 0 471 0 \n",
"# Nulls 0 0 0 0 0 \n",
"% Most Frequent Value 0.4 73.52 26.09 93.08 4.55 \n",
"\n",
" RM AGE DIS RAD TAX PTRATIO \\\n",
"Data Type float64 object float64 float64 float64 float64 \n",
"Mean 6.28463 3.79504 9.54941 408.237 18.4555 \n",
"Standard Deviation 0.702617 2.10571 8.70726 168.537 2.16495 \n",
"Median 6.2085 3.20745 5 330 19.05 \n",
"Min 3.561 1.1296 1 187 12.6 \n",
"Max 8.78 12.1265 24 711 22 \n",
"# Zeros 0 0 0 0 0 \n",
"# Nulls 0 0 0 0 0 0 \n",
"% Most Frequent Value 0.59 88.14 0.99 26.09 26.09 27.67 \n",
"\n",
" B LSTAT target AgeFlag Date \n",
"Data Type float64 float64 float64 bool datetime64[ns] \n",
"Mean 356.674 12.6531 22.5328 1 2008-01-01 13:30:00 \n",
"Standard Deviation 91.2949 7.14106 9.1971 0 \n",
"Median 391.44 11.36 21.2 1 \n",
"Min 0.32 1.73 5 \n",
"Max 396.9 37.97 50 \n",
"# Zeros 0 0 0 \n",
"# Nulls 0 0 0 0 0 \n",
"% Most Frequent Value 23.91 0.59 3.16 100 100 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_summary(df)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
}
},
"nbformat": 4,
"nbformat_minor": 4
}