Sensitive Data

[1]:
import datetime
import pandas as pd

from data_describe.privacy.detection import sensitive_data

Create sample profile

[2]:
# Sample profile records used as the demo input for sensitive_data().
# Every column is written as a plain list of six values; dict(enumerate(...))
# turns it into the {row_index: value} mapping (keys 0..5), so pandas sees a
# dict of dicts: outer keys are column names, inner keys are row labels.
# NOTE(review): the values look synthetic (generated test data) -- confirm
# before treating the rendered outputs as safe to publish.
sample_profile = {
    "company": dict(enumerate([
        "Fisher, Green and Dixon",
        "Lawrence, Herring and Riley",
        "Thompson-Ruiz",
        "Sloan PLC",
        "Smith LLC",
        "Nolan, Meyers and Johnson",
    ])),
    "ssn": dict(enumerate([
        "415-39-7809",
        "462-64-5856",
        "420-73-8333",
        "119-33-2186",
        "532-38-7349",
        "152-33-9873",
    ])),
    # Multi-line postal addresses; the embedded "\n" separates street and city lines.
    "residence": dict(enumerate([
        "24219 Archer Mountain Suite 924\nNorth Melissaborough, LA 41945",
        "5330 Wilson Fields Suite 560\nEast Heiditown, VA 70519",
        "PSC 5642, Box 8071\nAPO AA 06490",
        "1240 Jamie Forks Apt. 590\nAlistad, NY 60619",
        "PSC 9361, Box 5349\nAPO AP 57022",
        "7118 Williams Flat Apt. 075\nOwenhaven, LA 50600",
    ])),
    # Each cell of this column is itself a list of URLs (one to four per row).
    "website": dict(enumerate([
        [
            "http://sellers.com/",
            "https://garrett.com/",
            "https://stark.net/",
            "http://kaiser.org/",
        ],
        ["https://wood-hooper.com/", "http://martinez.net/"],
        [
            "http://www.arroyo-schultz.biz/",
            "https://www.curtis-smith.com/",
            "http://www.gray-hutchinson.com/",
            "http://www.barnes.com/",
        ],
        ["http://hernandez.info/", "https://www.williams-martin.org/"],
        ["https://www.wilson.com/"],
        ["https://mooney.com/"],
    ])),
    "username": dict(enumerate([
        "sandraharris",
        "jeffreylucas",
        "karla07",
        "johnwilliams",
        "amyhernandez",
        "eprice",
    ])),
    "name": dict(enumerate([
        "Doris Martinez",
        "Jeffery Garcia",
        "Kelsey Freeman",
        "Kimberly Carter",
        "Charles Gonzalez",
        "Roger Olson",
    ])),
    "address": dict(enumerate([
        "19659 Ivan Stravenue Apt. 471\nLake Nancyside, VT 71358",
        "0916 Michael Row\nSellersville, WI 08109",
        "63812 Morales Ranch Apt. 300\nLowestad, NM 26520",
        "65461 Regina Mall Suite 517\nSouth Benjaminborough, DE 22331",
        "Unit 7296 Box 6875\nDPO AP 65859",
        "0248 Cook Mews Apt. 466\nBrownfurt, IN 44282",
    ])),
    "mail": dict(enumerate([
        "mary84@yahoo.com",
        "imoore@yahoo.com",
        "yramirez@gmail.com",
        "nicholas11@hotmail.com",
        "nancy89@hotmail.com",
        "johnsonrobert@yahoo.com",
    ])),
    # Stored as datetime.date objects rather than strings.
    "birthdate": dict(enumerate([
        datetime.date(1936, 7, 5),
        datetime.date(1920, 5, 30),
        datetime.date(1958, 6, 13),
        datetime.date(1931, 5, 31),
        datetime.date(1905, 10, 12),
        datetime.date(1986, 5, 21),
    ])),
}
[3]:
# Build the demo frame from the nested dict: outer keys become the columns
# and the inner 0..5 keys become the row index.
df = pd.DataFrame(data=sample_profile)
# Preview only the first two records instead of dumping the whole frame.
df.head(n=2)
[3]:
company ssn residence website username name address mail birthdate
0 Fisher, Green and Dixon 415-39-7809 24219 Archer Mountain Suite 924\nNorth Melissa... [http://sellers.com/, https://garrett.com/, ht... sandraharris Doris Martinez 19659 Ivan Stravenue Apt. 471\nLake Nancyside,... mary84@yahoo.com 1936-07-05
1 Lawrence, Herring and Riley 462-64-5856 5330 Wilson Fields Suite 560\nEast Heiditown, ... [https://wood-hooper.com/, http://martinez.net/] jeffreylucas Jeffery Garcia 0916 Michael Row\nSellersville, WI 08109 imoore@yahoo.com 1920-05-30

Redact sensitive data

[4]:
# Redact mode over every column of the frame: detected values are replaced
# with <INFOTYPE> placeholders (see the rendered output below).
# sample_size is set to the row count -- presumably so that all rows are
# processed rather than a subset; confirm against the data_describe docs.
sensitive_data(
    df,
    mode="redact",
    sample_size=len(df),
)
company ssn residence website username name address mail birthdate
0 <PERSON>, <PERSON> and <PERSON> <US_SSN> 24219 <LOCATION> Suite 924\nNorth Melissaborou... ['http://<DOMAIN_NAME>/', 'https://<DOMAIN_NAM... sandraharris <PERSON> <DATE_TIME> <PERSON> Apt. 471\nLake Nancyside,... <EMAIL_ADDRESS> <DATE_TIME>
1 <PERSON>, <PERSON> and <PERSON> <US_SSN> 5330 Wilson Fields Suite 560\n<LOCATION>, <LOC... ['https://<DOMAIN_NAME>/', 'http://<DOMAIN_NAM... jeffreylucas <PERSON> <DATE_TIME><US_DRIVER_LICENSE>, <LOCATION> 08109 <EMAIL_ADDRESS> <DATE_TIME>
2 Thompson-Ruiz <US_SSN> PSC 5642, Box 8071\nAPO AA 06490 ['http://<DOMAIN_NAME>/', 'https://<DOMAIN_NAM... karla07 <PERSON> <DATE_TIME> <PERSON> Ranch Apt. 300\n<PERSON>,... <EMAIL_ADDRESS> <DATE_TIME>
3 Sloan PLC <US_SSN> 1240 Jamie Forks Apt. 590\n<PERSON>, <LOCATION... ['http://<DOMAIN_NAME>/', 'https://<DOMAIN_NAM... johnwilliams <PERSON> 65461 Regina Mall Suite 517\nSouth Benjaminbor... <EMAIL_ADDRESS> <DATE_TIME>
4 Smith LLC <US_SSN> PSC 9361, Box 5349\nAPO AP 57022 ['https://<DOMAIN_NAME>/'] amyhernandez <PERSON> Unit 7296 Box 6875\nDPO AP <DATE_TIME> <EMAIL_ADDRESS> <DATE_TIME>
5 <PERSON>, <PERSON> and <PERSON> <US_SSN> <DATE_TIME> <PERSON>. 075\nOwenhaven, LA 50600 ['https://<DOMAIN_NAME>/'] eprice <PERSON> 0248 Cook Mews Apt. 466\nBrownfurt, IN 44282 <EMAIL_ADDRESS> <DATE_TIME>
[4]:
<data_describe.privacy.detection.SensitiveDataWidget at 0x7fbdf59d4390>

Redact sensitive data using selected columns

[5]:
# Same redaction, restricted to three columns via `columns`; the output frame
# contains only those columns.
# NOTE(review): detect_infotypes=True is passed explicitly here but not in the
# other calls -- check the data_describe docs for its default and effect.
sensitive_data(
    df,
    mode="redact",
    columns=["birthdate", "mail", "ssn"],
    sample_size=len(df),
    detect_infotypes=True,
)
birthdate mail ssn
0 <DATE_TIME> <EMAIL_ADDRESS> <US_SSN>
1 <DATE_TIME> <EMAIL_ADDRESS> <US_SSN>
2 <DATE_TIME> <EMAIL_ADDRESS> <US_SSN>
3 <DATE_TIME> <EMAIL_ADDRESS> <US_SSN>
4 <DATE_TIME> <EMAIL_ADDRESS> <US_SSN>
5 <DATE_TIME> <EMAIL_ADDRESS> <US_SSN>
[5]:
<data_describe.privacy.detection.SensitiveDataWidget at 0x7fbdfc187190>

Encrypt sensitive data

[6]:
# Encrypt mode (SHA256): detected values are swapped for hex digests instead
# of <INFOTYPE> placeholders. The returned widget is kept in a variable so a
# later cell can read its attributes.
sensitivewidget = sensitive_data(
    df,
    mode="encrypt",
    sample_size=len(df),
)
# Bare last expression so the notebook displays the widget.
sensitivewidget
company ssn residence website username name address mail birthdate
0 ea49f5ee27409740ce7f74997ccf0184f3bbe9a72a86c0... 9fc2a689acf5f89d888c7acb3d43add7c4ddb320ca0fe0... 24219 2cee30d46d5a9c636d0ce08afea6aaba9b48cd9d... ['http://7f6bcdcd8eb28186cd2bd93cdd66bf2abba0f... sandraharris 779b878ad9f68899e5de5daac8322dd5b3a3991b10e3ed... 26f70e334556abc2841f5312a9984aa9a7f21b1924015d... b43ca3c683dabab5b42fee6181836eeea80dad950c1428... bd4b000796139f1fe01d62a9e6ccc8d50c8ca861fedd9c...
1 76364d81ab4c1832d9181572cc0d408f7bdd3900a61113... 567e4a168b80d74c435b22ec74bf56f52d1480372ee3db... 5330 Wilson Fields Suite 560\n2ab6e79e04018666... ['https://b5e885e576b91d7f332cecd36e1078e1c8ff... jeffreylucas b79d0309ea6eb45b8e8dee995c996184641de208262b0b... b6e3258d0f1106cf9ac124221f4bbdd7c327eb5dcb5880... 42ea5e56df211dbed485dc611277fe0932bb3139c4815e... 7e7bebf19314f96de0b1231c7c89627c3a87dd110c2614...
2 Thompson-Ruiz 06f11d164b97a22cd2a3a2a1a1b6df50268a8a21ee8a97... PSC 5642, Box 8071\nAPO AA 06490 ['http://7aad9afb51f77260b55915f74a5498b5850cc... karla07 08471f7bf132fdfb2a1a9ad7529fca6942425dc9d32834... 51ae0e05ffebf0ab306c50cfaa433765e089e781ddbfaf... 4ca073222470748ed4fb77ad0011402af414dd561cd645... 1b739ba212c2fec7848fbe6cffeffeb2f3737f40748542...
3 Sloan PLC e7ed1ca0c627c7e1928f4a969225b761133dfb3d9f687f... 1240 Jamie Forks Apt. 590\nc9e009c6a1baa39f305... ['http://587d2a51617478f91f145b2b261629d100768... johnwilliams 29b8f8f54a0405dc20c84f25d2344ddb3fa644459453c1... 65461 Regina Mall Suite 517\nSouth Benjaminbor... 14080d914340529a7af82567e749c5443108be5b9b945e... 8ce998bb9305cb1a1f92f8ce0b17261280cee95c3b4ec6...
4 Smith LLC 6c2264d952abb164d584bb7eba89cf41f94fbba413208c... PSC 9361, Box 5349\nAPO AP 57022 ['https://68bd3a706676d6fbfffbde3da0c7da6164da... amyhernandez 070f7870e5befe421c6dd0e1fd941a34d877b6b4758b90... Unit 7296 Box 6875\nDPO AP b51b9c2cd8d34bc45e1... 66d6d0dac0ba4a8f0e415c24d87448f6d100cb11083f18... 607431ddb916648b0af79f1795e50d8e5512efc7631d77...
5 71f90b7c03aad530eafa769b2ad97cb333ca6dd455fce1... 2d982e60214173e9d9dcace8376a3b33e224374465e289... eace48a195da1586d8c63aae19aa2be8ddd8712c79f29a... ['https://3328a0e4658c855acde71f8d85e98eac5d63... eprice 3b4cef0bb0914adbe31e209262f261589133e5a90f1788... 0248 Cook Mews Apt. 466\nBrownfurt, IN 44282 731feae7800d583626f6fe1ffda859e274ba4f7813c223... a8f11f485ed632115deb5cccd3d251ec901c445c6896d7...
[6]:
<data_describe.privacy.detection.SensitiveDataWidget at 0x7fbdfc19cbd0>

Identify infotypes in each column

[7]:
# Per-column infotypes recorded on the widget returned by the encrypt call
# above: a dict mapping each column name to the list of infotype labels found
# in it (an empty list means nothing was detected, e.g. 'username').
sensitivewidget.infotypes
[7]:
{'company': ['PERSON'],
 'ssn': ['US_SSN'],
 'residence': ['DATE_TIME', 'LOCATION', 'PERSON'],
 'website': ['DOMAIN_NAME'],
 'username': [],
 'name': ['PERSON'],
 'address': ['DATE_TIME', 'LOCATION', 'PERSON', 'US_DRIVER_LICENSE'],
 'mail': ['DOMAIN_NAME', 'EMAIL_ADDRESS'],
 'birthdate': ['DATE_TIME']}