{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Sensitive Data\n", "\n", "This notebook demonstrates `sensitive_data` from `data_describe.privacy.detection`: redacting sensitive values across a whole DataFrame or only selected columns, encrypting them, and listing the infotypes detected in each column." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import datetime\n", "import pandas as pd\n", "\n", "from data_describe.privacy.detection import sensitive_data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Create sample profile" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "sample_profile = {\n", " \"company\": {\n", " 0: \"Fisher, Green and Dixon\",\n", " 1: \"Lawrence, Herring and Riley\",\n", " 2: \"Thompson-Ruiz\",\n", " 3: \"Sloan PLC\",\n", " 4: \"Smith LLC\",\n", " 5: \"Nolan, Meyers and Johnson\",\n", " },\n", " \"ssn\": {\n", " 0: \"415-39-7809\",\n", " 1: \"462-64-5856\",\n", " 2: \"420-73-8333\",\n", " 3: \"119-33-2186\",\n", " 4: \"532-38-7349\",\n", " 5: \"152-33-9873\",\n", " },\n", " \"residence\": {\n", " 0: \"24219 Archer Mountain Suite 924\\nNorth Melissaborough, LA 41945\",\n", " 1: \"5330 Wilson Fields Suite 560\\nEast Heiditown, VA 70519\",\n", " 2: \"PSC 5642, Box 8071\\nAPO AA 06490\",\n", " 3: \"1240 Jamie Forks Apt. 590\\nAlistad, NY 60619\",\n", " 4: \"PSC 9361, Box 5349\\nAPO AP 57022\",\n", " 5: \"7118 Williams Flat Apt. 
075\\nOwenhaven, LA 50600\",\n", " },\n", " \"website\": {\n", " 0: [\n", " \"http://sellers.com/\",\n", " \"https://garrett.com/\",\n", " \"https://stark.net/\",\n", " \"http://kaiser.org/\",\n", " ],\n", " 1: [\"https://wood-hooper.com/\", \"http://martinez.net/\"],\n", " 2: [\n", " \"http://www.arroyo-schultz.biz/\",\n", " \"https://www.curtis-smith.com/\",\n", " \"http://www.gray-hutchinson.com/\",\n", " \"http://www.barnes.com/\",\n", " ],\n", " 3: [\"http://hernandez.info/\", \"https://www.williams-martin.org/\"],\n", " 4: [\"https://www.wilson.com/\"],\n", " 5: [\"https://mooney.com/\"],\n", " },\n", " \"username\": {\n", " 0: \"sandraharris\",\n", " 1: \"jeffreylucas\",\n", " 2: \"karla07\",\n", " 3: \"johnwilliams\",\n", " 4: \"amyhernandez\",\n", " 5: \"eprice\",\n", " },\n", " \"name\": {\n", " 0: \"Doris Martinez\",\n", " 1: \"Jeffery Garcia\",\n", " 2: \"Kelsey Freeman\",\n", " 3: \"Kimberly Carter\",\n", " 4: \"Charles Gonzalez\",\n", " 5: \"Roger Olson\",\n", " },\n", " \"address\": {\n", " 0: \"19659 Ivan Stravenue Apt. 471\\nLake Nancyside, VT 71358\",\n", " 1: \"0916 Michael Row\\nSellersville, WI 08109\",\n", " 2: \"63812 Morales Ranch Apt. 300\\nLowestad, NM 26520\",\n", " 3: \"65461 Regina Mall Suite 517\\nSouth Benjaminborough, DE 22331\",\n", " 4: \"Unit 7296 Box 6875\\nDPO AP 65859\",\n", " 5: \"0248 Cook Mews Apt. 
466\\nBrownfurt, IN 44282\",\n", " },\n", " \"mail\": {\n", " 0: \"mary84@yahoo.com\",\n", " 1: \"imoore@yahoo.com\",\n", " 2: \"yramirez@gmail.com\",\n", " 3: \"nicholas11@hotmail.com\",\n", " 4: \"nancy89@hotmail.com\",\n", " 5: \"johnsonrobert@yahoo.com\",\n", " },\n", " \"birthdate\": {\n", " 0: datetime.date(1936, 7, 5),\n", " 1: datetime.date(1920, 5, 30),\n", " 2: datetime.date(1958, 6, 13),\n", " 3: datetime.date(1931, 5, 31),\n", " 4: datetime.date(1905, 10, 12),\n", " 5: datetime.date(1986, 5, 21),\n", " }\n", " \n", "}" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
companyssnresidencewebsiteusernamenameaddressmailbirthdate
0Fisher, Green and Dixon415-39-780924219 Archer Mountain Suite 924\\nNorth Melissa...[http://sellers.com/, https://garrett.com/, ht...sandraharrisDoris Martinez19659 Ivan Stravenue Apt. 471\\nLake Nancyside,...mary84@yahoo.com1936-07-05
1Lawrence, Herring and Riley462-64-58565330 Wilson Fields Suite 560\\nEast Heiditown, ...[https://wood-hooper.com/, http://martinez.net/]jeffreylucasJeffery Garcia0916 Michael Row\\nSellersville, WI 08109imoore@yahoo.com1920-05-30
\n", "
" ], "text/plain": [ " company ssn \\\n", "0 Fisher, Green and Dixon 415-39-7809 \n", "1 Lawrence, Herring and Riley 462-64-5856 \n", "\n", " residence \\\n", "0 24219 Archer Mountain Suite 924\\nNorth Melissa... \n", "1 5330 Wilson Fields Suite 560\\nEast Heiditown, ... \n", "\n", " website username \\\n", "0 [http://sellers.com/, https://garrett.com/, ht... sandraharris \n", "1 [https://wood-hooper.com/, http://martinez.net/] jeffreylucas \n", "\n", " name address \\\n", "0 Doris Martinez 19659 Ivan Stravenue Apt. 471\\nLake Nancyside,... \n", "1 Jeffery Garcia 0916 Michael Row\\nSellersville, WI 08109 \n", "\n", " mail birthdate \n", "0 mary84@yahoo.com 1936-07-05 \n", "1 imoore@yahoo.com 1920-05-30 " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(sample_profile)\n", "df.head(2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Redact sensitive data" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
companyssnresidencewebsiteusernamenameaddressmailbirthdate
0<PERSON>, <PERSON> and <PERSON><US_SSN>24219 <LOCATION> Suite 924\\nNorth Melissaborou...['http://<DOMAIN_NAME>/', 'https://<DOMAIN_NAM...sandraharris<PERSON><DATE_TIME> <PERSON> Apt. 471\\nLake Nancyside,...<EMAIL_ADDRESS><DATE_TIME>
1<PERSON>, <PERSON> and <PERSON><US_SSN>5330 Wilson Fields Suite 560\\n<LOCATION>, <LOC...['https://<DOMAIN_NAME>/', 'http://<DOMAIN_NAM...jeffreylucas<PERSON><DATE_TIME><US_DRIVER_LICENSE>, <LOCATION> 08109<EMAIL_ADDRESS><DATE_TIME>
2Thompson-Ruiz<US_SSN>PSC 5642, Box 8071\\nAPO AA 06490['http://<DOMAIN_NAME>/', 'https://<DOMAIN_NAM...karla07<PERSON><DATE_TIME> <PERSON> Ranch Apt. 300\\n<PERSON>,...<EMAIL_ADDRESS><DATE_TIME>
3Sloan PLC<US_SSN>1240 Jamie Forks Apt. 590\\n<PERSON>, <LOCATION...['http://<DOMAIN_NAME>/', 'https://<DOMAIN_NAM...johnwilliams<PERSON>65461 Regina Mall Suite 517\\nSouth Benjaminbor...<EMAIL_ADDRESS><DATE_TIME>
4Smith LLC<US_SSN>PSC 9361, Box 5349\\nAPO AP 57022['https://<DOMAIN_NAME>/']amyhernandez<PERSON>Unit 7296 Box 6875\\nDPO AP <DATE_TIME><EMAIL_ADDRESS><DATE_TIME>
5<PERSON>, <PERSON> and <PERSON><US_SSN><DATE_TIME> <PERSON>. 075\\nOwenhaven, LA 50600['https://<DOMAIN_NAME>/']eprice<PERSON>0248 Cook Mews Apt. 466\\nBrownfurt, IN 44282<EMAIL_ADDRESS><DATE_TIME>
\n", "
" ], "text/plain": [ " company ssn \\\n", "0 , and \n", "1 , and \n", "2 Thompson-Ruiz \n", "3 Sloan PLC \n", "4 Smith LLC \n", "5 , and \n", "\n", " residence \\\n", "0 24219 Suite 924\\nNorth Melissaborou... \n", "1 5330 Wilson Fields Suite 560\\n, , . 075\\nOwenhaven, LA 50600 \n", "\n", " website username name \\\n", "0 ['http:///', 'https:// \n", "1 ['https:///', 'http:// \n", "2 ['http:///', 'https:// \n", "3 ['http:///', 'https:// \n", "4 ['https:///'] amyhernandez \n", "5 ['https:///'] eprice \n", "\n", " address mail \\\n", "0 Apt. 471\\nLake Nancyside,... \n", "1 , 08109 \n", "2 Ranch Apt. 300\\n,... \n", "3 65461 Regina Mall Suite 517\\nSouth Benjaminbor... \n", "4 Unit 7296 Box 6875\\nDPO AP \n", "5 0248 Cook Mews Apt. 466\\nBrownfurt, IN 44282 \n", "\n", " birthdate \n", "0 \n", "1 \n", "2 \n", "3 \n", "4 \n", "5 " ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sensitive_data(df, mode='redact', sample_size=len(df))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Redact sensitive data using selected columns" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
birthdatemailssn
0<DATE_TIME><EMAIL_ADDRESS><US_SSN>
1<DATE_TIME><EMAIL_ADDRESS><US_SSN>
2<DATE_TIME><EMAIL_ADDRESS><US_SSN>
3<DATE_TIME><EMAIL_ADDRESS><US_SSN>
4<DATE_TIME><EMAIL_ADDRESS><US_SSN>
5<DATE_TIME><EMAIL_ADDRESS><US_SSN>
\n", "
" ], "text/plain": [ " birthdate mail ssn\n", "0 \n", "1 \n", "2 \n", "3 \n", "4 \n", "5 " ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sensitive_data(df, mode=\"redact\", columns=[\"birthdate\", \"mail\", \"ssn\"], sample_size=len(df), detect_infotypes=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Encrypt sensitive data" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
companyssnresidencewebsiteusernamenameaddressmailbirthdate
0ea49f5ee27409740ce7f74997ccf0184f3bbe9a72a86c0...9fc2a689acf5f89d888c7acb3d43add7c4ddb320ca0fe0...24219 2cee30d46d5a9c636d0ce08afea6aaba9b48cd9d...['http://7f6bcdcd8eb28186cd2bd93cdd66bf2abba0f...sandraharris779b878ad9f68899e5de5daac8322dd5b3a3991b10e3ed...26f70e334556abc2841f5312a9984aa9a7f21b1924015d...b43ca3c683dabab5b42fee6181836eeea80dad950c1428...bd4b000796139f1fe01d62a9e6ccc8d50c8ca861fedd9c...
176364d81ab4c1832d9181572cc0d408f7bdd3900a61113...567e4a168b80d74c435b22ec74bf56f52d1480372ee3db...5330 Wilson Fields Suite 560\\n2ab6e79e04018666...['https://b5e885e576b91d7f332cecd36e1078e1c8ff...jeffreylucasb79d0309ea6eb45b8e8dee995c996184641de208262b0b...b6e3258d0f1106cf9ac124221f4bbdd7c327eb5dcb5880...42ea5e56df211dbed485dc611277fe0932bb3139c4815e...7e7bebf19314f96de0b1231c7c89627c3a87dd110c2614...
2Thompson-Ruiz06f11d164b97a22cd2a3a2a1a1b6df50268a8a21ee8a97...PSC 5642, Box 8071\\nAPO AA 06490['http://7aad9afb51f77260b55915f74a5498b5850cc...karla0708471f7bf132fdfb2a1a9ad7529fca6942425dc9d32834...51ae0e05ffebf0ab306c50cfaa433765e089e781ddbfaf...4ca073222470748ed4fb77ad0011402af414dd561cd645...1b739ba212c2fec7848fbe6cffeffeb2f3737f40748542...
3Sloan PLCe7ed1ca0c627c7e1928f4a969225b761133dfb3d9f687f...1240 Jamie Forks Apt. 590\\nc9e009c6a1baa39f305...['http://587d2a51617478f91f145b2b261629d100768...johnwilliams29b8f8f54a0405dc20c84f25d2344ddb3fa644459453c1...65461 Regina Mall Suite 517\\nSouth Benjaminbor...14080d914340529a7af82567e749c5443108be5b9b945e...8ce998bb9305cb1a1f92f8ce0b17261280cee95c3b4ec6...
4Smith LLC6c2264d952abb164d584bb7eba89cf41f94fbba413208c...PSC 9361, Box 5349\\nAPO AP 57022['https://68bd3a706676d6fbfffbde3da0c7da6164da...amyhernandez070f7870e5befe421c6dd0e1fd941a34d877b6b4758b90...Unit 7296 Box 6875\\nDPO AP b51b9c2cd8d34bc45e1...66d6d0dac0ba4a8f0e415c24d87448f6d100cb11083f18...607431ddb916648b0af79f1795e50d8e5512efc7631d77...
571f90b7c03aad530eafa769b2ad97cb333ca6dd455fce1...2d982e60214173e9d9dcace8376a3b33e224374465e289...eace48a195da1586d8c63aae19aa2be8ddd8712c79f29a...['https://3328a0e4658c855acde71f8d85e98eac5d63...eprice3b4cef0bb0914adbe31e209262f261589133e5a90f1788...0248 Cook Mews Apt. 466\\nBrownfurt, IN 44282731feae7800d583626f6fe1ffda859e274ba4f7813c223...a8f11f485ed632115deb5cccd3d251ec901c445c6896d7...
\n", "
" ], "text/plain": [ " company \\\n", "0 ea49f5ee27409740ce7f74997ccf0184f3bbe9a72a86c0... \n", "1 76364d81ab4c1832d9181572cc0d408f7bdd3900a61113... \n", "2 Thompson-Ruiz \n", "3 Sloan PLC \n", "4 Smith LLC \n", "5 71f90b7c03aad530eafa769b2ad97cb333ca6dd455fce1... \n", "\n", " ssn \\\n", "0 9fc2a689acf5f89d888c7acb3d43add7c4ddb320ca0fe0... \n", "1 567e4a168b80d74c435b22ec74bf56f52d1480372ee3db... \n", "2 06f11d164b97a22cd2a3a2a1a1b6df50268a8a21ee8a97... \n", "3 e7ed1ca0c627c7e1928f4a969225b761133dfb3d9f687f... \n", "4 6c2264d952abb164d584bb7eba89cf41f94fbba413208c... \n", "5 2d982e60214173e9d9dcace8376a3b33e224374465e289... \n", "\n", " residence \\\n", "0 24219 2cee30d46d5a9c636d0ce08afea6aaba9b48cd9d... \n", "1 5330 Wilson Fields Suite 560\\n2ab6e79e04018666... \n", "2 PSC 5642, Box 8071\\nAPO AA 06490 \n", "3 1240 Jamie Forks Apt. 590\\nc9e009c6a1baa39f305... \n", "4 PSC 9361, Box 5349\\nAPO AP 57022 \n", "5 eace48a195da1586d8c63aae19aa2be8ddd8712c79f29a... \n", "\n", " website username \\\n", "0 ['http://7f6bcdcd8eb28186cd2bd93cdd66bf2abba0f... sandraharris \n", "1 ['https://b5e885e576b91d7f332cecd36e1078e1c8ff... jeffreylucas \n", "2 ['http://7aad9afb51f77260b55915f74a5498b5850cc... karla07 \n", "3 ['http://587d2a51617478f91f145b2b261629d100768... johnwilliams \n", "4 ['https://68bd3a706676d6fbfffbde3da0c7da6164da... amyhernandez \n", "5 ['https://3328a0e4658c855acde71f8d85e98eac5d63... eprice \n", "\n", " name \\\n", "0 779b878ad9f68899e5de5daac8322dd5b3a3991b10e3ed... \n", "1 b79d0309ea6eb45b8e8dee995c996184641de208262b0b... \n", "2 08471f7bf132fdfb2a1a9ad7529fca6942425dc9d32834... \n", "3 29b8f8f54a0405dc20c84f25d2344ddb3fa644459453c1... \n", "4 070f7870e5befe421c6dd0e1fd941a34d877b6b4758b90... \n", "5 3b4cef0bb0914adbe31e209262f261589133e5a90f1788... \n", "\n", " address \\\n", "0 26f70e334556abc2841f5312a9984aa9a7f21b1924015d... \n", "1 b6e3258d0f1106cf9ac124221f4bbdd7c327eb5dcb5880... \n", "2 51ae0e05ffebf0ab306c50cfaa433765e089e781ddbfaf... 
\n", "3 65461 Regina Mall Suite 517\\nSouth Benjaminbor... \n", "4 Unit 7296 Box 6875\\nDPO AP b51b9c2cd8d34bc45e1... \n", "5 0248 Cook Mews Apt. 466\\nBrownfurt, IN 44282 \n", "\n", " mail \\\n", "0 b43ca3c683dabab5b42fee6181836eeea80dad950c1428... \n", "1 42ea5e56df211dbed485dc611277fe0932bb3139c4815e... \n", "2 4ca073222470748ed4fb77ad0011402af414dd561cd645... \n", "3 14080d914340529a7af82567e749c5443108be5b9b945e... \n", "4 66d6d0dac0ba4a8f0e415c24d87448f6d100cb11083f18... \n", "5 731feae7800d583626f6fe1ffda859e274ba4f7813c223... \n", "\n", " birthdate \n", "0 bd4b000796139f1fe01d62a9e6ccc8d50c8ca861fedd9c... \n", "1 7e7bebf19314f96de0b1231c7c89627c3a87dd110c2614... \n", "2 1b739ba212c2fec7848fbe6cffeffeb2f3737f40748542... \n", "3 8ce998bb9305cb1a1f92f8ce0b17261280cee95c3b4ec6... \n", "4 607431ddb916648b0af79f1795e50d8e5512efc7631d77... \n", "5 a8f11f485ed632115deb5cccd3d251ec901c445c6896d7... " ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# mode=\"encrypt\" replaces each detected value with a SHA-256 hash (a one-way digest, not reversible encryption)\n", "sensitivewidget = sensitive_data(df, mode=\"encrypt\", sample_size=len(df))\n", "sensitivewidget" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Identify infotypes in each column" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'company': ['PERSON'],\n", " 'ssn': ['US_SSN'],\n", " 'residence': ['DATE_TIME', 'LOCATION', 'PERSON'],\n", " 'website': ['DOMAIN_NAME'],\n", " 'username': [],\n", " 'name': ['PERSON'],\n", " 'address': ['DATE_TIME', 'LOCATION', 'PERSON', 'US_DRIVER_LICENSE'],\n", " 'mail': ['DOMAIN_NAME', 'EMAIL_ADDRESS'],\n", " 'birthdate': ['DATE_TIME']}" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sensitivewidget.infotypes" ] } ], "metadata": { "kernelspec": { "display_name": "datadescribe", "language": "python", "name": 
"datadescribe" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.7-final" } }, "nbformat": 4, "nbformat_minor": 4 }