{"cells": [{"cell_type": "markdown", "metadata": {"papermill": {"exception": false, "start_time": "2020-10-29T02:59:28.240242", "end_time": "2020-10-29T02:59:28.250210", "duration": 0.009968, "status": "completed"}, "tags": []}, "source": "# Sensitive Data"}, {"cell_type": "code", "execution_count": 1, "metadata": {"execution": {"iopub.execute_input": "2020-10-29T02:59:28.278209Z", "iopub.status.busy": "2020-10-29T02:59:28.277210Z", "iopub.status.idle": "2020-10-29T02:59:30.696246Z", "shell.execute_reply": "2020-10-29T02:59:30.696246Z"}, "papermill": {"exception": false, "start_time": "2020-10-29T02:59:28.260209", "end_time": "2020-10-29T02:59:30.697212", "duration": 2.437003, "status": "completed"}, "tags": []}, "outputs": [], "source": "import datetime\nimport pandas as pd\n\nfrom data_describe.privacy.detection import sensitive_data"}, {"cell_type": "markdown", "metadata": {"papermill": {"exception": false, "start_time": "2020-10-29T02:59:30.707211", "end_time": "2020-10-29T02:59:30.717268", "duration": 0.010057, "status": "completed"}, "tags": []}, "source": "## Create Sample Profile"}, {"cell_type": "code", "execution_count": 2, "metadata": {"execution": {"iopub.execute_input": "2020-10-29T02:59:30.746243Z", "iopub.status.busy": "2020-10-29T02:59:30.745244Z", "iopub.status.idle": "2020-10-29T02:59:30.753209Z", "shell.execute_reply": "2020-10-29T02:59:30.752209Z"}, "papermill": {"exception": false, "start_time": "2020-10-29T02:59:30.727260", "end_time": "2020-10-29T02:59:30.753209", "duration": 0.025949, "status": "completed"}, "tags": []}, "outputs": [], "source": "sample_profile = {\n \"company\": {\n 0: \"Fisher, Green and Dixon\",\n 1: \"Lawrence, Herring and Riley\",\n 2: \"Thompson-Ruiz\",\n 3: \"Sloan PLC\",\n 4: \"Smith LLC\",\n 5: \"Nolan, Meyers and Johnson\",\n },\n \"ssn\": {\n 0: \"415-39-7809\",\n 1: \"462-64-5856\",\n 2: \"420-73-8333\",\n 3: \"119-33-2186\",\n 4: \"532-38-7349\",\n 5: \"152-33-9873\",\n },\n \"residence\": {\n 0: \"24219 Archer 
Mountain Suite 924\\nNorth Melissaborough, LA 41945\",\n 1: \"5330 Wilson Fields Suite 560\\nEast Heiditown, VA 70519\",\n 2: \"PSC 5642, Box 8071\\nAPO AA 06490\",\n 3: \"1240 Jamie Forks Apt. 590\\nAlistad, NY 60619\",\n 4: \"PSC 9361, Box 5349\\nAPO AP 57022\",\n 5: \"7118 Williams Flat Apt. 075\\nOwenhaven, LA 50600\",\n },\n \"website\": {\n 0: [\n \"http://sellers.com/\",\n \"https://garrett.com/\",\n \"https://stark.net/\",\n \"http://kaiser.org/\",\n ],\n 1: [\"https://wood-hooper.com/\", \"http://martinez.net/\"],\n 2: [\n \"http://www.arroyo-schultz.biz/\",\n \"https://www.curtis-smith.com/\",\n \"http://www.gray-hutchinson.com/\",\n \"http://www.barnes.com/\",\n ],\n 3: [\"http://hernandez.info/\", \"https://www.williams-martin.org/\"],\n 4: [\"https://www.wilson.com/\"],\n 5: [\"https://mooney.com/\"],\n },\n \"username\": {\n 0: \"sandraharris\",\n 1: \"jeffreylucas\",\n 2: \"karla07\",\n 3: \"johnwilliams\",\n 4: \"amyhernandez\",\n 5: \"eprice\",\n },\n \"name\": {\n 0: \"Doris Martinez\",\n 1: \"Jeffery Garcia\",\n 2: \"Kelsey Freeman\",\n 3: \"Kimberly Carter\",\n 4: \"Charles Gonzalez\",\n 5: \"Roger Olson\",\n },\n \"address\": {\n 0: \"19659 Ivan Stravenue Apt. 471\\nLake Nancyside, VT 71358\",\n 1: \"0916 Michael Row\\nSellersville, WI 08109\",\n 2: \"63812 Morales Ranch Apt. 300\\nLowestad, NM 26520\",\n 3: \"65461 Regina Mall Suite 517\\nSouth Benjaminborough, DE 22331\",\n 4: \"Unit 7296 Box 6875\\nDPO AP 65859\",\n 5: \"0248 Cook Mews Apt. 
466\\nBrownfurt, IN 44282\",\n },\n \"mail\": {\n 0: \"mary84@yahoo.com\",\n 1: \"imoore@yahoo.com\",\n 2: \"yramirez@gmail.com\",\n 3: \"nicholas11@hotmail.com\",\n 4: \"nancy89@hotmail.com\",\n 5: \"johnsonrobert@yahoo.com\",\n },\n \"birthdate\": {\n 0: datetime.date(1936, 7, 5),\n 1: datetime.date(1920, 5, 30),\n 2: datetime.date(1958, 6, 13),\n 3: datetime.date(1931, 5, 31),\n 4: datetime.date(1905, 10, 12),\n 5: datetime.date(1986, 5, 21),\n }\n \n}"}, {"cell_type": "code", "execution_count": 3, "metadata": {"execution": {"iopub.execute_input": "2020-10-29T02:59:30.790245Z", "iopub.status.busy": "2020-10-29T02:59:30.789243Z", "iopub.status.idle": "2020-10-29T02:59:30.800244Z", "shell.execute_reply": "2020-10-29T02:59:30.801208Z"}, "papermill": {"exception": false, "start_time": "2020-10-29T02:59:30.764209", "end_time": "2020-10-29T02:59:30.801208", "duration": 0.036999, "status": "completed"}, "tags": []}, "outputs": [{"output_type": "execute_result", "metadata": {}, "data": {"text/plain": " company ssn \\\n0 Fisher, Green and Dixon 415-39-7809 \n1 Lawrence, Herring and Riley 462-64-5856 \n\n residence \\\n0 24219 Archer Mountain Suite 924\\nNorth Melissa... \n1 5330 Wilson Fields Suite 560\\nEast Heiditown, ... \n\n website username \\\n0 [http://sellers.com/, https://garrett.com/, ht... sandraharris \n1 [https://wood-hooper.com/, http://martinez.net/] jeffreylucas \n\n name address \\\n0 Doris Martinez 19659 Ivan Stravenue Apt. 471\\nLake Nancyside,... \n1 Jeffery Garcia 0916 Michael Row\\nSellersville, WI 08109 \n\n mail birthdate \n0 mary84@yahoo.com 1936-07-05 \n1 imoore@yahoo.com 1920-05-30 ", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
companyssnresidencewebsiteusernamenameaddressmailbirthdate
0Fisher, Green and Dixon415-39-780924219 Archer Mountain Suite 924\\nNorth Melissa...[http://sellers.com/, https://garrett.com/, ht...sandraharrisDoris Martinez19659 Ivan Stravenue Apt. 471\\nLake Nancyside,...mary84@yahoo.com1936-07-05
1Lawrence, Herring and Riley462-64-58565330 Wilson Fields Suite 560\\nEast Heiditown, ...[https://wood-hooper.com/, http://martinez.net/]jeffreylucasJeffery Garcia0916 Michael Row\\nSellersville, WI 08109imoore@yahoo.com1920-05-30
\n
"}, "execution_count": 3}], "source": "df = pd.DataFrame(sample_profile)\ndf.head(2)"}, {"cell_type": "markdown", "metadata": {"papermill": {"exception": false, "start_time": "2020-10-29T02:59:30.813214", "end_time": "2020-10-29T02:59:30.824208", "duration": 0.010994, "status": "completed"}, "tags": []}, "source": "## Redact sensitive data"}, {"cell_type": "code", "execution_count": 4, "metadata": {"execution": {"iopub.execute_input": "2020-10-29T02:59:30.851419Z", "iopub.status.busy": "2020-10-29T02:59:30.850218Z", "iopub.status.idle": "2020-10-29T02:59:52.255112Z", "shell.execute_reply": "2020-10-29T02:59:52.254112Z"}, "papermill": {"exception": false, "start_time": "2020-10-29T02:59:30.835211", "end_time": "2020-10-29T02:59:52.255112", "duration": 21.419901, "status": "completed"}, "tags": []}, "outputs": [{"output_type": "stream", "name": "stderr", "text": "UserWarning: The Dask Engine for Modin is experimental.\n"}, {"output_type": "display_data", "metadata": {}, "data": {"text/plain": " company ssn \\\n0 Fisher, Green and \n1 , and \n2 \n3 Sloan PLC \n4 Smith LLC \n5 , and Johnson \n\n residence \\\n0 24219 Archer Mountain Suite 924\\nNorth Melissa... \n1 5330 Fields Suite 560\\nEast Heiditown... \n2 PSC 5642, Box \\nAPO AA 06490 \n3 Apt. 590\\nAlistad, . 075\\nOwenhaven, \n\n website username name \\\n0 ['http:///', 'https:// \n1 ['https:///', 'http:// \n2 ['http:///', 'https:// \n3 ['http:///', 'https:// \n4 ['https:///'] \n5 ['https:///'] \n\n address mail \\\n0 Apt. 471\\nLake Nancyside,... \n1 0916 Row\\n, \n2 63812 Ranch Apt. 300\\nLowestad, NM \n3 65461 Regina Mall Suite 517\\nSouth Benjaminbor... \n4 Unit Box 6875\\nDPO AP 65859 \n5 Cook Mews Apt. 466\\nBrownfurt, IN ... \n\n birthdate \n0 \n1 \n2 \n3 \n4 -12 \n5 ", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
companyssnresidencewebsiteusernamenameaddressmailbirthdate
0Fisher, Green and <PERSON><US_SSN>24219 Archer Mountain Suite 924\\nNorth Melissa...['http://<DOMAIN_NAME>/', 'https://<DOMAIN_NAM...<PERSON><PERSON><DATE_TIME> <PERSON> Apt. 471\\nLake Nancyside,...<EMAIL_ADDRESS><DATE_TIME>
1<PERSON>, <PERSON> and <PERSON><US_SSN>5330 <PERSON> Fields Suite 560\\nEast Heiditown...['https://<DOMAIN_NAME>/', 'http://<DOMAIN_NAM...<PERSON><PERSON>0916 <PERSON> Row\\n<US_DRIVER_LICENSE>, <LOCAT...<EMAIL_ADDRESS><DATE_TIME>
2<PERSON><US_SSN>PSC 5642, Box <DATE_TIME>\\nAPO AA 06490['http://<DOMAIN_NAME>/', 'https://<DOMAIN_NAM...karla07<PERSON>63812 <PERSON> Ranch Apt. 300\\nLowestad, NM <D...<EMAIL_ADDRESS><DATE_TIME>
3Sloan PLC<US_SSN><PERSON> Apt. 590\\nAlistad, <LOCATION> <DATE_T...['http://<DOMAIN_NAME>/', 'https://<DOMAIN_NAM...johnwilliams<PERSON>65461 Regina Mall Suite 517\\nSouth Benjaminbor...<EMAIL_ADDRESS><DATE_TIME>
4Smith LLC<US_SSN>PSC 9361, Box 5349\\nAPO AP 57022['https://<DOMAIN_NAME>/']<PERSON><PERSON>Unit <DATE_TIME> Box 6875\\nDPO AP 65859<EMAIL_ADDRESS><DATE_TIME>-12
5<PERSON>, <PERSON> and Johnson<US_SSN><PERSON>. 075\\nOwenhaven, <LOCATION> <DATE_TIME>['https://<DOMAIN_NAME>/']<PERSON><PERSON><DATE_TIME> Cook Mews Apt. 466\\nBrownfurt, IN ...<EMAIL_ADDRESS><DATE_TIME>
\n
"}}, {"output_type": "execute_result", "metadata": {}, "data": {"text/plain": ""}, "execution_count": 4}], "source": "sensitive_data(df, mode='redact', sample_size=len(df))"}, {"cell_type": "markdown", "metadata": {"papermill": {"exception": false, "start_time": "2020-10-29T02:59:52.270114", "end_time": "2020-10-29T02:59:52.285112", "duration": 0.014998, "status": "completed"}, "tags": []}, "source": "## Redact sensitive data using selected columns"}, {"cell_type": "code", "execution_count": 5, "metadata": {"execution": {"iopub.execute_input": "2020-10-29T02:59:52.324112Z", "iopub.status.busy": "2020-10-29T02:59:52.323113Z", "iopub.status.idle": "2020-10-29T03:00:12.741149Z", "shell.execute_reply": "2020-10-29T03:00:12.739150Z"}, "papermill": {"exception": false, "start_time": "2020-10-29T02:59:52.300111", "end_time": "2020-10-29T03:00:12.741149", "duration": 20.441038, "status": "completed"}, "tags": []}, "outputs": [{"output_type": "stream", "name": "stderr", "text": "INFO:presidio:nlp_engine not provided. Creating new SpacyNlpEngine instance\n"}, {"output_type": "stream", "name": "stderr", "text": "INFO:presidio:Loading NLP model: spaCy en_core_web_lg\n"}, {"output_type": "stream", "name": "stderr", "text": "INFO:presidio:\u001b[1m\n===================== Info about model 'en_core_web_lg' =====================\u001b[0m\n"}, {"output_type": "stream", "name": "stderr", "text": "INFO:presidio:\nlang en \nname core_web_lg \nlicense MIT \nauthor Explosion \nurl https://explosion.ai \nemail contact@explosion.ai \ndescription English multi-task CNN trained on OntoNotes, with GloVe vectors trained on Common Crawl. 
Assigns word vectors, context-specific token vectors, POS tags, dependency parse and named entities.\nsources [{'name': 'OntoNotes 5', 'url': 'https://catalog.ldc.upenn.edu/LDC2013T19', 'license': 'commercial (licensed by Explosion)'}, {'name': 'Common Crawl'}]\npipeline ['tagger', 'parser', 'ner'] \nversion 2.2.0 \nspacy_version >=2.2.0 \nparent_package spacy \nlabels {'tagger': ['$', \"''\", ',', '-LRB-', '-RRB-', '.', ':', 'ADD', 'AFX', 'CC', 'CD', 'DT', 'EX', 'FW', 'HYPH', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NFP', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', 'XX', '_SP', '``'], 'parser': ['ROOT', 'acl', 'acomp', 'advcl', 'advmod', 'agent', 'amod', 'appos', 'attr', 'aux', 'auxpass', 'case', 'cc', 'ccomp', 'compound', 'conj', 'csubj', 'csubjpass', 'dative', 'dep', 'det', 'dobj', 'expl', 'intj', 'mark', 'meta', 'neg', 'nmod', 'npadvmod', 'nsubj', 'nsubjpass', 'nummod', 'oprd', 'parataxis', 'pcomp', 'pobj', 'poss', 'preconj', 'predet', 'prep', 'prt', 'punct', 'quantmod', 'relcl', 'xcomp'], 'ner': ['CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART']}\nvectors {'width': 300, 'vectors': 684831, 'keys': 684830, 'name': 'en_core_web_lg.vectors'}\nsource C:\\Users\\David\\.conda\\envs\\test-env\\lib\\site-packages\\en_core_web_lg\n\n"}, {"output_type": "stream", "name": "stderr", "text": "INFO:presidio:Printing spaCy model and package details:\n\n {'lang': 'en', 'name': 'core_web_lg', 'license': 'MIT', 'author': 'Explosion', 'url': 'https://explosion.ai', 'email': 'contact@explosion.ai', 'description': 'English multi-task CNN trained on OntoNotes, with GloVe vectors trained on Common Crawl. 
Assigns word vectors, context-specific token vectors, POS tags, dependency parse and named entities.', 'sources': [{'name': 'OntoNotes 5', 'url': 'https://catalog.ldc.upenn.edu/LDC2013T19', 'license': 'commercial (licensed by Explosion)'}, {'name': 'Common Crawl'}], 'pipeline': ['tagger', 'parser', 'ner'], 'version': '2.2.0', 'spacy_version': '>=2.2.0', 'parent_package': 'spacy', 'accuracy': {'las': 90.1644260278, 'uas': 91.9835496082, 'token_acc': 99.7579930934, 'tags_acc': 97.2056522464, 'ents_f': 86.3045056111, 'ents_p': 86.217859334, 'ents_r': 86.391326217}, 'speed': {'cpu': 7127.1086034688, 'gpu': None, 'nwords': 291314}, 'labels': {'tagger': ['$', \"''\", ',', '-LRB-', '-RRB-', '.', ':', 'ADD', 'AFX', 'CC', 'CD', 'DT', 'EX', 'FW', 'HYPH', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NFP', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', 'XX', '_SP', '``'], 'parser': ['ROOT', 'acl', 'acomp', 'advcl', 'advmod', 'agent', 'amod', 'appos', 'attr', 'aux', 'auxpass', 'case', 'cc', 'ccomp', 'compound', 'conj', 'csubj', 'csubjpass', 'dative', 'dep', 'det', 'dobj', 'expl', 'intj', 'mark', 'meta', 'neg', 'nmod', 'npadvmod', 'nsubj', 'nsubjpass', 'nummod', 'oprd', 'parataxis', 'pcomp', 'pobj', 'poss', 'preconj', 'predet', 'prep', 'prt', 'punct', 'quantmod', 'relcl', 'xcomp'], 'ner': ['CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART']}, 'vectors': {'width': 300, 'vectors': 684831, 'keys': 684830, 'name': 'en_core_web_lg.vectors'}, 'source': 'C:\\\\Users\\\\David\\\\.conda\\\\envs\\\\test-env\\\\lib\\\\site-packages\\\\en_core_web_lg'}\n\n\n"}, {"output_type": "stream", "name": "stderr", "text": "INFO:presidio:Recognizer registry not provided. 
Creating default RecognizerRegistry instance\n"}, {"output_type": "stream", "name": "stderr", "text": "INFO:presidio:Loaded recognizer: CreditCardRecognizer\n"}, {"output_type": "stream", "name": "stderr", "text": "INFO:presidio:Loaded recognizer: CryptoRecognizer\n"}, {"output_type": "stream", "name": "stderr", "text": "INFO:presidio:Loaded recognizer: DomainRecognizer\n"}, {"output_type": "stream", "name": "stderr", "text": "INFO:presidio:Loaded recognizer: EmailRecognizer\n"}, {"output_type": "stream", "name": "stderr", "text": "INFO:presidio:Loaded recognizer: IbanRecognizer\n"}, {"output_type": "stream", "name": "stderr", "text": "INFO:presidio:Loaded recognizer: IpRecognizer\n"}, {"output_type": "stream", "name": "stderr", "text": "INFO:presidio:Loaded recognizer: NhsRecognizer\n"}, {"output_type": "stream", "name": "stderr", "text": "INFO:presidio:Loaded recognizer: UsBankRecognizer\n"}, {"output_type": "stream", "name": "stderr", "text": "INFO:presidio:Loaded recognizer: UsLicenseRecognizer\n"}, {"output_type": "stream", "name": "stderr", "text": "INFO:presidio:Loaded recognizer: UsItinRecognizer\n"}, {"output_type": "stream", "name": "stderr", "text": "INFO:presidio:Loaded recognizer: UsPassportRecognizer\n"}, {"output_type": "stream", "name": "stderr", "text": "INFO:presidio:Loaded recognizer: UsPhoneRecognizer\n"}, {"output_type": "stream", "name": "stderr", "text": "INFO:presidio:Loaded recognizer: UsSsnRecognizer\n"}, {"output_type": "stream", "name": "stderr", "text": "INFO:presidio:Loaded recognizer: SpacyRecognizer\n"}, {"output_type": "stream", "name": "stderr", "text": "INFO:presidio:Loaded recognizer: SgFinRecognizer\n"}, {"output_type": "display_data", "metadata": {}, "data": {"text/plain": " birthdate mail ssn\n0 \n1 \n2 \n3 \n4 -12 \n5 ", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
birthdatemailssn
0<DATE_TIME><EMAIL_ADDRESS><US_SSN>
1<DATE_TIME><EMAIL_ADDRESS><US_SSN>
2<DATE_TIME><EMAIL_ADDRESS><US_SSN>
3<DATE_TIME><EMAIL_ADDRESS><US_SSN>
4<DATE_TIME>-12<EMAIL_ADDRESS><US_SSN>
5<DATE_TIME><EMAIL_ADDRESS><US_SSN>
\n
"}}, {"output_type": "execute_result", "metadata": {}, "data": {"text/plain": ""}, "execution_count": 5}], "source": "sensitive_data(df, mode=\"redact\", columns=[\"birthdate\", \"mail\", \"ssn\"], sample_size=len(df), detect_infotypes=True)"}, {"cell_type": "markdown", "metadata": {"papermill": {"exception": false, "start_time": "2020-10-29T03:00:12.764145", "end_time": "2020-10-29T03:00:12.786122", "duration": 0.021977, "status": "completed"}, "tags": []}, "source": "## Encrypt Data"}, {"cell_type": "code", "execution_count": 6, "metadata": {"execution": {"iopub.execute_input": "2020-10-29T03:00:12.867155Z", "iopub.status.busy": "2020-10-29T03:00:12.863157Z", "iopub.status.idle": "2020-10-29T03:00:32.323328Z", "shell.execute_reply": "2020-10-29T03:00:32.323328Z"}, "papermill": {"exception": false, "start_time": "2020-10-29T03:00:12.808124", "end_time": "2020-10-29T03:00:32.324321", "duration": 19.516197, "status": "completed"}, "tags": []}, "outputs": [{"output_type": "stream", "name": "stderr", "text": "INFO:presidio:nlp_engine not provided. Creating new SpacyNlpEngine instance\n"}, {"output_type": "stream", "name": "stderr", "text": "INFO:presidio:Loading NLP model: spaCy en_core_web_lg\n"}, {"output_type": "stream", "name": "stderr", "text": "INFO:presidio:\u001b[1m\n===================== Info about model 'en_core_web_lg' =====================\u001b[0m\n"}, {"output_type": "stream", "name": "stderr", "text": "INFO:presidio:\nlang en \nname core_web_lg \nlicense MIT \nauthor Explosion \nurl https://explosion.ai \nemail contact@explosion.ai \ndescription English multi-task CNN trained on OntoNotes, with GloVe vectors trained on Common Crawl. 
Assigns word vectors, context-specific token vectors, POS tags, dependency parse and named entities.\nsources [{'name': 'OntoNotes 5', 'url': 'https://catalog.ldc.upenn.edu/LDC2013T19', 'license': 'commercial (licensed by Explosion)'}, {'name': 'Common Crawl'}]\npipeline ['tagger', 'parser', 'ner'] \nversion 2.2.0 \nspacy_version >=2.2.0 \nparent_package spacy \nlabels {'tagger': ['$', \"''\", ',', '-LRB-', '-RRB-', '.', ':', 'ADD', 'AFX', 'CC', 'CD', 'DT', 'EX', 'FW', 'HYPH', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NFP', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', 'XX', '_SP', '``'], 'parser': ['ROOT', 'acl', 'acomp', 'advcl', 'advmod', 'agent', 'amod', 'appos', 'attr', 'aux', 'auxpass', 'case', 'cc', 'ccomp', 'compound', 'conj', 'csubj', 'csubjpass', 'dative', 'dep', 'det', 'dobj', 'expl', 'intj', 'mark', 'meta', 'neg', 'nmod', 'npadvmod', 'nsubj', 'nsubjpass', 'nummod', 'oprd', 'parataxis', 'pcomp', 'pobj', 'poss', 'preconj', 'predet', 'prep', 'prt', 'punct', 'quantmod', 'relcl', 'xcomp'], 'ner': ['CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART']}\nvectors {'width': 300, 'vectors': 684831, 'keys': 684830, 'name': 'en_core_web_lg.vectors'}\nsource C:\\Users\\David\\.conda\\envs\\test-env\\lib\\site-packages\\en_core_web_lg\n\n"}, {"output_type": "stream", "name": "stderr", "text": "INFO:presidio:Printing spaCy model and package details:\n\n {'lang': 'en', 'name': 'core_web_lg', 'license': 'MIT', 'author': 'Explosion', 'url': 'https://explosion.ai', 'email': 'contact@explosion.ai', 'description': 'English multi-task CNN trained on OntoNotes, with GloVe vectors trained on Common Crawl. 
Assigns word vectors, context-specific token vectors, POS tags, dependency parse and named entities.', 'sources': [{'name': 'OntoNotes 5', 'url': 'https://catalog.ldc.upenn.edu/LDC2013T19', 'license': 'commercial (licensed by Explosion)'}, {'name': 'Common Crawl'}], 'pipeline': ['tagger', 'parser', 'ner'], 'version': '2.2.0', 'spacy_version': '>=2.2.0', 'parent_package': 'spacy', 'accuracy': {'las': 90.1644260278, 'uas': 91.9835496082, 'token_acc': 99.7579930934, 'tags_acc': 97.2056522464, 'ents_f': 86.3045056111, 'ents_p': 86.217859334, 'ents_r': 86.391326217}, 'speed': {'cpu': 7127.1086034688, 'gpu': None, 'nwords': 291314}, 'labels': {'tagger': ['$', \"''\", ',', '-LRB-', '-RRB-', '.', ':', 'ADD', 'AFX', 'CC', 'CD', 'DT', 'EX', 'FW', 'HYPH', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NFP', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', 'XX', '_SP', '``'], 'parser': ['ROOT', 'acl', 'acomp', 'advcl', 'advmod', 'agent', 'amod', 'appos', 'attr', 'aux', 'auxpass', 'case', 'cc', 'ccomp', 'compound', 'conj', 'csubj', 'csubjpass', 'dative', 'dep', 'det', 'dobj', 'expl', 'intj', 'mark', 'meta', 'neg', 'nmod', 'npadvmod', 'nsubj', 'nsubjpass', 'nummod', 'oprd', 'parataxis', 'pcomp', 'pobj', 'poss', 'preconj', 'predet', 'prep', 'prt', 'punct', 'quantmod', 'relcl', 'xcomp'], 'ner': ['CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART']}, 'vectors': {'width': 300, 'vectors': 684831, 'keys': 684830, 'name': 'en_core_web_lg.vectors'}, 'source': 'C:\\\\Users\\\\David\\\\.conda\\\\envs\\\\test-env\\\\lib\\\\site-packages\\\\en_core_web_lg'}\n\n\n"}, {"output_type": "stream", "name": "stderr", "text": "INFO:presidio:Recognizer registry not provided. 
Creating default RecognizerRegistry instance\n"}, {"output_type": "stream", "name": "stderr", "text": "INFO:presidio:Loaded recognizer: CreditCardRecognizer\n"}, {"output_type": "stream", "name": "stderr", "text": "INFO:presidio:Loaded recognizer: CryptoRecognizer\n"}, {"output_type": "stream", "name": "stderr", "text": "INFO:presidio:Loaded recognizer: DomainRecognizer\n"}, {"output_type": "stream", "name": "stderr", "text": "INFO:presidio:Loaded recognizer: EmailRecognizer\n"}, {"output_type": "stream", "name": "stderr", "text": "INFO:presidio:Loaded recognizer: IbanRecognizer\n"}, {"output_type": "stream", "name": "stderr", "text": "INFO:presidio:Loaded recognizer: IpRecognizer\n"}, {"output_type": "stream", "name": "stderr", "text": "INFO:presidio:Loaded recognizer: NhsRecognizer\n"}, {"output_type": "stream", "name": "stderr", "text": "INFO:presidio:Loaded recognizer: UsBankRecognizer\n"}, {"output_type": "stream", "name": "stderr", "text": "INFO:presidio:Loaded recognizer: UsLicenseRecognizer\n"}, {"output_type": "stream", "name": "stderr", "text": "INFO:presidio:Loaded recognizer: UsItinRecognizer\n"}, {"output_type": "stream", "name": "stderr", "text": "INFO:presidio:Loaded recognizer: UsPassportRecognizer\n"}, {"output_type": "stream", "name": "stderr", "text": "INFO:presidio:Loaded recognizer: UsPhoneRecognizer\n"}, {"output_type": "stream", "name": "stderr", "text": "INFO:presidio:Loaded recognizer: UsSsnRecognizer\n"}, {"output_type": "stream", "name": "stderr", "text": "INFO:presidio:Loaded recognizer: SpacyRecognizer\n"}, {"output_type": "stream", "name": "stderr", "text": "INFO:presidio:Loaded recognizer: SgFinRecognizer\n"}, {"output_type": "display_data", "metadata": {}, "data": {"text/plain": " company \\\n0 Fisher, Green and 7e0b7a468d5cbb313ac238ae4942... \n1 76364d81ab4c1832d9181572cc0d408f7bdd3900a61113... \n2 4bf5f01b21094aceb8d03ea384cc90e0c163898704dc2b... \n3 Sloan PLC \n4 Smith LLC \n5 71f90b7c03aad530eafa769b2ad97cb333ca6dd455fce1... 
\n\n ssn \\\n0 9fc2a689acf5f89d888c7acb3d43add7c4ddb320ca0fe0... \n1 567e4a168b80d74c435b22ec74bf56f52d1480372ee3db... \n2 06f11d164b97a22cd2a3a2a1a1b6df50268a8a21ee8a97... \n3 e7ed1ca0c627c7e1928f4a969225b761133dfb3d9f687f... \n4 6c2264d952abb164d584bb7eba89cf41f94fbba413208c... \n5 2d982e60214173e9d9dcace8376a3b33e224374465e289... \n\n residence \\\n0 24219 Archer Mountain Suite 924\\nNorth Melissa... \n1 5330 fb7961d139e4da12af18c24571a166fb77c391a15... \n2 PSC 5642, Box e58bb6dcf3948e6c37eefaf185c769e0... \n3 2ebf4a7fdd9a4209fd1b52d9b672bccbf65e196f62169f... \n4 PSC 9361, Box 5349\\nAPO AP 57022 \n5 824d23478ea9d0d571699e83112a567183dbfde775f2f4... \n\n website \\\n0 ['http://7f6bcdcd8eb28186cd2bd93cdd66bf2abba0f... \n1 ['https://b5e885e576b91d7f332cecd36e1078e1c8ff... \n2 ['http://7aad9afb51f77260b55915f74a5498b5850cc... \n3 ['http://587d2a51617478f91f145b2b261629d100768... \n4 ['https://68bd3a706676d6fbfffbde3da0c7da6164da... \n5 ['https://3328a0e4658c855acde71f8d85e98eac5d63... \n\n username \\\n0 83238faab748c22d8d84203bae0628716e31bd00891ecc... \n1 cf7bfbcaf80510fb918e37ad52ff7e9b59e86308a7e9c5... \n2 karla07 \n3 johnwilliams \n4 66565c13d4869eda2a02c506c1b3d7e65c769ac209bd00... \n5 eb0f1bd37f4ba7e4ecb33aeec2423538e088530eff1cea... \n\n name \\\n0 779b878ad9f68899e5de5daac8322dd5b3a3991b10e3ed... \n1 b79d0309ea6eb45b8e8dee995c996184641de208262b0b... \n2 08471f7bf132fdfb2a1a9ad7529fca6942425dc9d32834... \n3 29b8f8f54a0405dc20c84f25d2344ddb3fa644459453c1... \n4 070f7870e5befe421c6dd0e1fd941a34d877b6b4758b90... \n5 3b4cef0bb0914adbe31e209262f261589133e5a90f1788... \n\n address \\\n0 26f70e334556abc2841f5312a9984aa9a7f21b1924015d... \n1 0916 f089eaef57aba315bc0e1455985c0c8e40c247f07... \n2 63812 6f005a18689861aa2634b7a3155a148a56fcd272... \n3 65461 Regina Mall Suite 517\\nSouth Benjaminbor... \n4 Unit 63bfc4219c16f9ba618b6b76d81edef58b912ca65... \n5 912b3255f7a8016fbf71bd4b2f7ffb646945200bb4b63c... 
\n\n mail \\\n0 b43ca3c683dabab5b42fee6181836eeea80dad950c1428... \n1 42ea5e56df211dbed485dc611277fe0932bb3139c4815e... \n2 4ca073222470748ed4fb77ad0011402af414dd561cd645... \n3 14080d914340529a7af82567e749c5443108be5b9b945e... \n4 66d6d0dac0ba4a8f0e415c24d87448f6d100cb11083f18... \n5 731feae7800d583626f6fe1ffda859e274ba4f7813c223... \n\n birthdate \n0 bd4b000796139f1fe01d62a9e6ccc8d50c8ca861fedd9c... \n1 7e7bebf19314f96de0b1231c7c89627c3a87dd110c2614... \n2 1b739ba212c2fec7848fbe6cffeffeb2f3737f40748542... \n3 8ce998bb9305cb1a1f92f8ce0b17261280cee95c3b4ec6... \n4 e80ef1d1d41d0b4b0bc14a77236ee6018cc31db6e9aa07... \n5 a8f11f485ed632115deb5cccd3d251ec901c445c6896d7... ", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
companyssnresidencewebsiteusernamenameaddressmailbirthdate
0Fisher, Green and 7e0b7a468d5cbb313ac238ae4942...9fc2a689acf5f89d888c7acb3d43add7c4ddb320ca0fe0...24219 Archer Mountain Suite 924\\nNorth Melissa...['http://7f6bcdcd8eb28186cd2bd93cdd66bf2abba0f...83238faab748c22d8d84203bae0628716e31bd00891ecc...779b878ad9f68899e5de5daac8322dd5b3a3991b10e3ed...26f70e334556abc2841f5312a9984aa9a7f21b1924015d...b43ca3c683dabab5b42fee6181836eeea80dad950c1428...bd4b000796139f1fe01d62a9e6ccc8d50c8ca861fedd9c...
176364d81ab4c1832d9181572cc0d408f7bdd3900a61113...567e4a168b80d74c435b22ec74bf56f52d1480372ee3db...5330 fb7961d139e4da12af18c24571a166fb77c391a15...['https://b5e885e576b91d7f332cecd36e1078e1c8ff...cf7bfbcaf80510fb918e37ad52ff7e9b59e86308a7e9c5...b79d0309ea6eb45b8e8dee995c996184641de208262b0b...0916 f089eaef57aba315bc0e1455985c0c8e40c247f07...42ea5e56df211dbed485dc611277fe0932bb3139c4815e...7e7bebf19314f96de0b1231c7c89627c3a87dd110c2614...
24bf5f01b21094aceb8d03ea384cc90e0c163898704dc2b...06f11d164b97a22cd2a3a2a1a1b6df50268a8a21ee8a97...PSC 5642, Box e58bb6dcf3948e6c37eefaf185c769e0...['http://7aad9afb51f77260b55915f74a5498b5850cc...karla0708471f7bf132fdfb2a1a9ad7529fca6942425dc9d32834...63812 6f005a18689861aa2634b7a3155a148a56fcd272...4ca073222470748ed4fb77ad0011402af414dd561cd645...1b739ba212c2fec7848fbe6cffeffeb2f3737f40748542...
3Sloan PLCe7ed1ca0c627c7e1928f4a969225b761133dfb3d9f687f...2ebf4a7fdd9a4209fd1b52d9b672bccbf65e196f62169f...['http://587d2a51617478f91f145b2b261629d100768...johnwilliams29b8f8f54a0405dc20c84f25d2344ddb3fa644459453c1...65461 Regina Mall Suite 517\\nSouth Benjaminbor...14080d914340529a7af82567e749c5443108be5b9b945e...8ce998bb9305cb1a1f92f8ce0b17261280cee95c3b4ec6...
4Smith LLC6c2264d952abb164d584bb7eba89cf41f94fbba413208c...PSC 9361, Box 5349\\nAPO AP 57022['https://68bd3a706676d6fbfffbde3da0c7da6164da...66565c13d4869eda2a02c506c1b3d7e65c769ac209bd00...070f7870e5befe421c6dd0e1fd941a34d877b6b4758b90...Unit 63bfc4219c16f9ba618b6b76d81edef58b912ca65...66d6d0dac0ba4a8f0e415c24d87448f6d100cb11083f18...e80ef1d1d41d0b4b0bc14a77236ee6018cc31db6e9aa07...
571f90b7c03aad530eafa769b2ad97cb333ca6dd455fce1...2d982e60214173e9d9dcace8376a3b33e224374465e289...824d23478ea9d0d571699e83112a567183dbfde775f2f4...['https://3328a0e4658c855acde71f8d85e98eac5d63...eb0f1bd37f4ba7e4ecb33aeec2423538e088530eff1cea...3b4cef0bb0914adbe31e209262f261589133e5a90f1788...912b3255f7a8016fbf71bd4b2f7ffb646945200bb4b63c...731feae7800d583626f6fe1ffda859e274ba4f7813c223...a8f11f485ed632115deb5cccd3d251ec901c445c6896d7...
\n
"}}, {"output_type": "execute_result", "metadata": {}, "data": {"text/plain": ""}, "execution_count": 6}], "source": "# mode=\"encrypt\" replaces each detected value with its SHA-256 hex digest (a one-way hash, not reversible encryption)\nsensitivewidget = sensitive_data(df, mode=\"encrypt\", sample_size=len(df))\nsensitivewidget"}, {"cell_type": "markdown", "metadata": {"papermill": {"exception": false, "start_time": "2020-10-29T03:00:32.370292", "end_time": "2020-10-29T03:00:32.405293", "duration": 0.035001, "status": "completed"}, "tags": []}, "source": "## Identify infotypes in each column"}, {"cell_type": "code", "execution_count": 7, "metadata": {"execution": {"iopub.execute_input": "2020-10-29T03:00:32.490296Z", "iopub.status.busy": "2020-10-29T03:00:32.489305Z", "iopub.status.idle": "2020-10-29T03:00:32.495319Z", "shell.execute_reply": "2020-10-29T03:00:32.495319Z"}, "papermill": {"exception": false, "start_time": "2020-10-29T03:00:32.444293", "end_time": "2020-10-29T03:00:32.496319", "duration": 0.052026, "status": "completed"}, "tags": []}, "outputs": [{"output_type": "execute_result", "metadata": {}, "data": {"text/plain": "{'company': ['PERSON'],\n 'ssn': ['US_SSN'],\n 'residence': ['DATE_TIME', 'LOCATION', 'PERSON'],\n 'website': ['DOMAIN_NAME', 'PERSON'],\n 'username': ['PERSON'],\n 'name': ['PERSON'],\n 'address': ['DATE_TIME', 'LOCATION', 'PERSON', 'US_DRIVER_LICENSE'],\n 'mail': ['DOMAIN_NAME', 'EMAIL_ADDRESS'],\n 'birthdate': ['DATE_TIME']}"}, "execution_count": 7}], "source": "sensitivewidget.infotypes"}], "metadata": {"kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}, "language_info": {"name": "python", "version": "3.7.9", "mimetype": "text/x-python", "codemirror_mode": {"name": "ipython", "version": 3}, "pygments_lexer": "ipython3", "nbconvert_exporter": "python", "file_extension": ".py"}, "papermill": {"duration": 72.116237, "end_time": "2020-10-29T03:00:38.578100", "environment_variables": {}, "exception": null, "input_path": "C:\\workspace\\data-describe\\examples\\Sensitive_Data.ipynb", "output_path": 
"C:\\workspace\\data-describe\\examples\\Sensitive_Data.ipynb", "parameters": {}, "start_time": "2020-10-29T02:59:26.461863", "version": "2.1.2"}}, "nbformat": 4, "nbformat_minor": 4}