Data Summary

[ ]:
import pandas as pd
import datetime
from data_describe import data_summary
from datetime import datetime
[2]:
from sklearn.datasets import load_boston

# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2 —
# confirm the environment pins an older release, otherwise this import fails.
data = load_boston()

# One frame holding every feature column plus a trailing `target` column.
df = pd.DataFrame(
    data=data.data,
    columns=list(data.feature_names),
).assign(target=data.target)

# Peek at the first row only.
df.head(1)
[2]:
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT target
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.09 1.0 296.0 15.3 396.9 4.98 24.0
[3]:
# Change data types to demonstrate data summary
#
# The boolean flag is derived from the numeric AGE *before* the column is relabelled.
# Casting the "young"/"old" labels with astype(bool) marks every row True, because any
# non-empty string is truthy, which leaves a constant column in the summary.
# The numeric-dtype guard keeps the cell re-runnable: once AGE holds labels, the
# `< 29` comparison would raise TypeError on a second execution.
if pd.api.types.is_numeric_dtype(df['AGE']):
    is_young = df['AGE'] < 29
    df["AgeFlag"] = is_young  # bool column: True where AGE < 29
    df['AGE'] = is_young.map({True: "young", False: "old"})  # object (string) column

# Integer column.
df['ZN'] = df['ZN'].astype(int)

# Datetime column: the same timestamp is broadcast to every row.
df['Date'] = datetime.strptime('1/1/2008 1:30 PM', '%m/%d/%Y %I:%M %p')
[4]:
# Render the per-column summary table for df (dtype, mean, standard deviation, median,
# min/max, zero and null counts, and share of the most frequent value, as shown in the
# cell output).
data_summary(df)
[4]:
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT target AgeFlag Date
Data Type float64 int32 float64 float64 float64 float64 object float64 float64 float64 float64 float64 float64 float64 bool datetime64[ns]
Mean 3.61352 11.3478 11.1368 0.06917 0.554695 6.28463 3.79504 9.54941 408.237 18.4555 356.674 12.6531 22.5328 1 2008-01-01 13:30:00
Standard Deviation 8.60155 23.3106 6.86035 0.253994 0.115878 0.702617 2.10571 8.70726 168.537 2.16495 91.2949 7.14106 9.1971 0
Median 0.25651 0 9.69 0 0.538 6.2085 3.20745 5 330 19.05 391.44 11.36 21.2 1
Min 0.00632 0 0.46 0 0.385 3.561 1.1296 1 187 12.6 0.32 1.73 5
Max 88.9762 100 27.74 1 0.871 8.78 12.1265 24 711 22 396.9 37.97 50
# Zeros 0 372 0 471 0 0 0 0 0 0 0 0 0
# Nulls 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
% Most Frequent Value 0.4 73.52 26.09 93.08 4.55 0.59 88.14 0.99 26.09 26.09 27.67 23.91 0.59 3.16 100 100