{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# Getting Started with sanityze" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## If you are developing locally\n", "\n", "After checking out the repository, you can run the following command to install the package and its dependencies:\n", "\n", "```bash\n", "$ poetry install\n", "```\n", "\n", "Open this notebook in JupyterLab and run the cells in the \"Examples\" section." ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## If you are using the public package\n", "\n", "To use `sanityze` in a project, you can install it from PyPI:\n", "\n", "```bash\n", "$ pip install sanityze\n", "```\n", "\n", "Then, you can import it in your code:\n", "\n", "```python\n", "from sanityze.cleanser import *\n", "from sanityze.spotters import *\n", "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Examples" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
product_nameprice
0laptop1200
1printer foo@gaga.com150
2tablet300
3desk 5555 5555 5555 4444450
4chair200
\n", "
" ], "text/plain": [ " product_name price\n", "0 laptop 1200\n", "1 printer foo@gaga.com 150\n", "2 tablet 300\n", "3 desk 5555 5555 5555 4444 450\n", "4 chair 200" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# setup a dummy test dataframe\n", "import pandas as pd\n", "data = {'product_name': ['laptop', 'printer foo@gaga.com', 'tablet', 'desk 5555 5555 5555 4444', 'chair'],\n", " 'price': [1200, 150, 300, 450, 200]\n", " }\n", "df = pd.DataFrame(data)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "- EMAILADDRS: Processing cell laptop \n", "- EMAILADDRS: Processed cell laptop \n", "- CREDITCARD: Processing cell laptop \n", "- CREDITCARD: Processed cell laptop \n", "- EMAILADDRS: Processing cell printer foo@gaga.com \n", "- EMAILADDRS: Processed cell printer EMAILADDRS \n", "- CREDITCARD: Processing cell printer EMAILADDRS \n", "- CREDITCARD: Processed cell printer EMAILADDRS \n", "- EMAILADDRS: Processing cell tablet \n", "- EMAILADDRS: Processed cell tablet \n", "- CREDITCARD: Processing cell tablet \n", "- CREDITCARD: Processed cell tablet \n", "- EMAILADDRS: Processing cell desk 5555 5555 5555 4444 \n", "- EMAILADDRS: Processed cell desk 5555 5555 5555 4444 \n", "- CREDITCARD: Processing cell desk 5555 5555 5555 4444 \n", "- CREDITCARD: Processed cell desk 5555 5555 5555 4444 \n", "- EMAILADDRS: Processing cell chair \n", "- EMAILADDRS: Processed cell chair \n", "- CREDITCARD: Processing cell chair \n", "- CREDITCARD: Processed cell chair \n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
product_nameprice
0laptop1200
1printer EMAILADDRS150
2tablet300
3desk 5555 5555 5555 4444450
4chair200
\n", "
" ], "text/plain": [ " product_name price\n", "0 laptop 1200\n", "1 printer EMAILADDRS 150\n", "2 tablet 300\n", "3 desk 5555 5555 5555 4444 450\n", "4 chair 200" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sanityze.cleanser import *\n", "c = Cleanser()\n", "c.clean(df, verbose=False)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Testing with dummy data" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
first_namelast_nameemail_addressvisa_ccmaster_ccbalanceactive_memberage
0JacobKingthe following is my email address JacobKing100...this is my credit card: 46584813986029205339168719695860100124
1ChloeLavoiethe following is my email address ChloeLavoie2...this is my credit card: 45325465105752805284482559079650200036
2MylesClarkMylesClark300@hotmail.comthis is my credit card: 45396509396552905338287181016540300123
3DanielMurrayDanielMurray400@outlook.cathis is my credit card: 47165051601134705581255820397210400028
4​LucyLandry​LucyLandry500@ubc.cathis is my credit card: 47169084003715505453813871212040500137
\n", "
" ], "text/plain": [ " first_name last_name email_address \\\n", "0 Jacob King the following is my email address JacobKing100... \n", "1 Chloe Lavoie the following is my email address ChloeLavoie2... \n", "2 Myles Clark MylesClark300@hotmail.com \n", "3 Daniel Murray DanielMurray400@outlook.ca \n", "4 ​Lucy Landry ​LucyLandry500@ubc.ca \n", "\n", " visa_cc master_cc balance \\\n", "0 this is my credit card: 4658481398602920 5339168719695860 100 \n", "1 this is my credit card: 4532546510575280 5284482559079650 200 \n", "2 this is my credit card: 4539650939655290 5338287181016540 300 \n", "3 this is my credit card: 4716505160113470 5581255820397210 400 \n", "4 this is my credit card: 4716908400371550 5453813871212040 500 \n", "\n", " active_member age \n", "0 1 24 \n", "1 0 36 \n", "2 1 23 \n", "3 0 28 \n", "4 1 37 " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_with_pii = pd.read_csv(\"../tests/data_with_pii.csv\")\n", "df_with_pii.head()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
first_namelast_nameemail_addressvisa_ccmaster_ccbalanceactive_memberage
0JacobKingthe following is my email address EMAILADDRSthis is my credit card: CREDITCARDCREDITCARD100124
1ChloeLavoiethe following is my email address EMAILADDRSthis is my credit card: CREDITCARDCREDITCARD200036
2MylesClarkEMAILADDRSthis is my credit card: CREDITCARDCREDITCARD300123
3DanielMurrayEMAILADDRSthis is my credit card: CREDITCARDCREDITCARD400028
4​LucyLandry​EMAILADDRSthis is my credit card: CREDITCARDCREDITCARD500137
5AustinCoteEMAILADDRSthis is my credit card: CREDITCARDCREDITCARD600131
6LeoLeblancEMAILADDRSthis is my credit card: CREDITCARDthis is my master card number: CREDITCARD700041
7LukeCoteEMAILADDRSCREDITCARDCREDITCARD800143
8ChloeMartinEMAILADDRSCREDITCARDCREDITCARD900058
9SophiaTaylorEMAILADDRSCREDITCARDCREDITCARD1000167
10SebastianLiEMAILADDRSCREDITCARDCREDITCARD1100025
11TheodoreWalkerEMAILADDRSCREDITCARDCREDITCARD1200129
12GraysonMooreEMAILADDRSCREDITCARDCREDITCARD1300038
13MadelynRossEMAILADDRSCREDITCARDCREDITCARD1400164
14CharlieJohnsonEMAILADDRSCREDITCARDCREDITCARD1500066
15IsaacDavisEMAILADDRSCREDITCARDCREDITCARD1600155
16GraceThomasEMAILADDRSCREDITCARDCREDITCARD1700043
17KaydenThomasEMAILADDRSCREDITCARDCREDITCARD1800148
18PeytonBergeronEMAILADDRSCREDITCARDCREDITCARD1900058
19EvelynJohnstonEMAILADDRSCREDITCARDCREDITCARD2000029
\n", "
" ], "text/plain": [ " first_name last_name email_address \\\n", "0 Jacob King the following is my email address EMAILADDRS \n", "1 Chloe Lavoie the following is my email address EMAILADDRS \n", "2 Myles Clark EMAILADDRS \n", "3 Daniel Murray EMAILADDRS \n", "4 ​Lucy Landry ​EMAILADDRS \n", "5 Austin Cote EMAILADDRS \n", "6 Leo Leblanc EMAILADDRS \n", "7 Luke Cote EMAILADDRS \n", "8 Chloe Martin EMAILADDRS \n", "9 Sophia Taylor EMAILADDRS \n", "10 Sebastian Li EMAILADDRS \n", "11 Theodore Walker EMAILADDRS \n", "12 Grayson Moore EMAILADDRS \n", "13 Madelyn Ross EMAILADDRS \n", "14 Charlie Johnson EMAILADDRS \n", "15 Isaac Davis EMAILADDRS \n", "16 Grace Thomas EMAILADDRS \n", "17 Kayden Thomas EMAILADDRS \n", "18 Peyton Bergeron EMAILADDRS \n", "19 Evelyn Johnston EMAILADDRS \n", "\n", " visa_cc \\\n", "0 this is my credit card: CREDITCARD \n", "1 this is my credit card: CREDITCARD \n", "2 this is my credit card: CREDITCARD \n", "3 this is my credit card: CREDITCARD \n", "4 this is my credit card: CREDITCARD \n", "5 this is my credit card: CREDITCARD \n", "6 this is my credit card: CREDITCARD \n", "7 CREDITCARD \n", "8 CREDITCARD \n", "9 CREDITCARD \n", "10 CREDITCARD \n", "11 CREDITCARD \n", "12 CREDITCARD \n", "13 CREDITCARD \n", "14 CREDITCARD \n", "15 CREDITCARD \n", "16 CREDITCARD \n", "17 CREDITCARD \n", "18 CREDITCARD \n", "19 CREDITCARD \n", "\n", " master_cc balance active_member age \n", "0 CREDITCARD 100 1 24 \n", "1 CREDITCARD 200 0 36 \n", "2 CREDITCARD 300 1 23 \n", "3 CREDITCARD 400 0 28 \n", "4 CREDITCARD 500 1 37 \n", "5 CREDITCARD 600 1 31 \n", "6 this is my master card number: CREDITCARD 700 0 41 \n", "7 CREDITCARD 800 1 43 \n", "8 CREDITCARD 900 0 58 \n", "9 CREDITCARD 1000 1 67 \n", "10 CREDITCARD 1100 0 25 \n", "11 CREDITCARD 1200 1 29 \n", "12 CREDITCARD 1300 0 38 \n", "13 CREDITCARD 1400 1 64 \n", "14 CREDITCARD 1500 0 66 \n", "15 CREDITCARD 1600 1 55 \n", "16 CREDITCARD 1700 0 43 \n", "17 CREDITCARD 1800 1 48 \n", "18 CREDITCARD 1900 0 58 
\n", "19 CREDITCARD 2000 0 29 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "c.clean(df_with_pii)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
first_namelast_namebalanceactive_memberage
0JacobKing100124
1ChloeLavoie200036
2MylesClark300123
3DanielMurray400028
4​LucyLandry500137
\n", "
" ], "text/plain": [ " first_name last_name balance active_member age\n", "0 Jacob King 100 1 24\n", "1 Chloe Lavoie 200 0 36\n", "2 Myles Clark 300 1 23\n", "3 Daniel Murray 400 0 28\n", "4 ​Lucy Landry 500 1 37" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_without_pii = pd.read_csv(\"../tests/data_without_pii.csv\")\n", "df_without_pii.head()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
first_namelast_namebalanceactive_memberage
0JacobKing100124
1ChloeLavoie200036
2MylesClark300123
3DanielMurray400028
4​LucyLandry500137
5AustinCote600131
6LeoLeblanc700041
7LukeCote800143
8ChloeMartin900058
9SophiaTaylor1000167
10SebastianLi1100025
11TheodoreWalker1200129
12GraysonMoore1300038
13MadelynRoss1400164
14CharlieJohnson1500066
15IsaacDavis1600155
16GraceThomas1700043
17KaydenThomas1800148
18PeytonBergeron1900058
19EvelynJohnston2000029
\n", "
" ], "text/plain": [ " first_name last_name balance active_member age\n", "0 Jacob King 100 1 24\n", "1 Chloe Lavoie 200 0 36\n", "2 Myles Clark 300 1 23\n", "3 Daniel Murray 400 0 28\n", "4 ​Lucy Landry 500 1 37\n", "5 Austin Cote 600 1 31\n", "6 Leo Leblanc 700 0 41\n", "7 Luke Cote 800 1 43\n", "8 Chloe Martin 900 0 58\n", "9 Sophia Taylor 1000 1 67\n", "10 Sebastian Li 1100 0 25\n", "11 Theodore Walker 1200 1 29\n", "12 Grayson Moore 1300 0 38\n", "13 Madelyn Ross 1400 1 64\n", "14 Charlie Johnson 1500 0 66\n", "15 Isaac Davis 1600 1 55\n", "16 Grace Thomas 1700 0 43\n", "17 Kayden Thomas 1800 1 48\n", "18 Peyton Bergeron 1900 0 58\n", "19 Evelyn Johnston 2000 0 29" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "c.clean(df_without_pii)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Testing with dummy data (hashing)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# create Cleanser with hash\n", "c = Cleanser(include_default_spotters=False)\n", "s1 = EmailSpotter(\"EMAILS\",True)\n", "s2 = CreditCardSpotter(\"CREDITCARDS\",True)\n", "c.add_spotter(s1)\n", "c.add_spotter(s2)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
first_namelast_nameemail_addressvisa_ccmaster_ccbalanceactive_memberage
0JacobKingthe following is my email address d3ebf4160b78...this is my credit card: 49bd7b28d230d310f17685...c077ea8331f1357b655546c0c6dd030c100124
1ChloeLavoiethe following is my email address ccc33e19aed8...this is my credit card: c44d39f19fdd5f1b07c2e4...1beee7a164fd2c76e8b4e588131d64c9200036
2MylesClark8a7ab41909ffabb0e5bd3e759e926550this is my credit card: b31abe94871fbe42c4805a...ff05ef02f05938f288220125c64d72e3300123
3DanielMurraye786525bbe5cdbd0e1b0723580c0be35this is my credit card: 1d5378ff80e8dbdf12c3d1...8c33e7d6d9f5b8b3e9f68b11416e9fe3400028
4​LucyLandry​66aa0f296907a96a0a15a64c54a0271cthis is my credit card: f2d2c9b2a8fdfd376aa806...2fe45c134eebaf4476d23fea7642824a500137
5AustinCote1aac2a4671c819e7ba448156eb2a945dthis is my credit card: e0690ff65144d6ff24fc2c...9e3172495cc711edec297ef8a5f095a5600131
6LeoLeblancb575f97818252f3ae9de320f38e6a26bthis is my credit card: bee6a0d7b4c91337eef146...this is my master card number: dfcc41738316c7d...700041
7LukeCote726ff593c99ae2734b61f7ba08cc0339c68e0b958a085134e22585ac92c08e3c6bb66b6d89abbbf0718f66720547ce57800143
8ChloeMartin40b148d282dfa8d609ab18207188463a22dabc63d739da192ef30a2bbcb06e61a8f9b7c5b5e4c4b3f8a4d37d51078cac900058
9SophiaTaylora6c9c2015492ed280d2da2b94f3f37a4444225bea558baa2a4c006f688853d1f6ad1e9a84f5dbd02512391a07c001ed51000167
10SebastianLi026dec7828d784304966748f8aac14e410fc94534572f34b028c61523f4605fed45be3c831abd474467bdd0b5d9e4dd61100025
11TheodoreWalker0c89577b11f335687d51f6769baf809bb28d6de54411d3884d8b24d55e28500452ccea324a0c1a50a266a332836450421200129
12GraysonMoore4e7ca558fbc639c4cb24f8588c118b3dba76eb7fc1be6293e5feba89f5c7639f39c34c75de06effa59da842781653c4a1300038
13MadelynRoss94cdc647d86ef5a8b54d5ff54b4c35b4211acd46154d7a437dcc03b3ce46e5cebdefeaadfd160b9ff82eb33bd2726b1d1400164
14CharlieJohnson3c235fdb2aa343a247c0f51ceda5eabeba97f2ad35d4d9f6fe00ed50e27623279fe8835d0d95deccf7dfd99c02c9c2941500066
15IsaacDavisffd1f67fff8581d26db24b85ce1d479a53309f3853ff954ef7ed621b38501e28e2737b551ebe7a0f4842f3f11bc2aa871600155
16GraceThomas8d54befa39a3f13bea178f38a8fc67de99a0625ae373ff242d7ed9c76930b8368aba9728ea64663867b50a17c10bf7291700043
17KaydenThomase71770b14ccf5aa8587750c5c5318f4a779c725caf15e67c16b59536eaa5b86292610a6913a995c2d9f5e08bfcd6c1051800148
18PeytonBergeronb7299528a41c8f5baf74ecc541b7aa4e060783327b0e977a61614fa2129a732868321411ad37a3dccabe7902620ef7d01900058
19EvelynJohnston95473fc56071e41d16b3b769a07d17ada22af5a670e749c4e8529a840088c37283d71d2e14d5de862d8bcd28c23c54172000029
\n", "
" ], "text/plain": [ " first_name last_name email_address \\\n", "0 Jacob King the following is my email address d3ebf4160b78... \n", "1 Chloe Lavoie the following is my email address ccc33e19aed8... \n", "2 Myles Clark 8a7ab41909ffabb0e5bd3e759e926550 \n", "3 Daniel Murray e786525bbe5cdbd0e1b0723580c0be35 \n", "4 ​Lucy Landry ​66aa0f296907a96a0a15a64c54a0271c \n", "5 Austin Cote 1aac2a4671c819e7ba448156eb2a945d \n", "6 Leo Leblanc b575f97818252f3ae9de320f38e6a26b \n", "7 Luke Cote 726ff593c99ae2734b61f7ba08cc0339 \n", "8 Chloe Martin 40b148d282dfa8d609ab18207188463a \n", "9 Sophia Taylor a6c9c2015492ed280d2da2b94f3f37a4 \n", "10 Sebastian Li 026dec7828d784304966748f8aac14e4 \n", "11 Theodore Walker 0c89577b11f335687d51f6769baf809b \n", "12 Grayson Moore 4e7ca558fbc639c4cb24f8588c118b3d \n", "13 Madelyn Ross 94cdc647d86ef5a8b54d5ff54b4c35b4 \n", "14 Charlie Johnson 3c235fdb2aa343a247c0f51ceda5eabe \n", "15 Isaac Davis ffd1f67fff8581d26db24b85ce1d479a \n", "16 Grace Thomas 8d54befa39a3f13bea178f38a8fc67de \n", "17 Kayden Thomas e71770b14ccf5aa8587750c5c5318f4a \n", "18 Peyton Bergeron b7299528a41c8f5baf74ecc541b7aa4e \n", "19 Evelyn Johnston 95473fc56071e41d16b3b769a07d17ad \n", "\n", " visa_cc \\\n", "0 this is my credit card: 49bd7b28d230d310f17685... \n", "1 this is my credit card: c44d39f19fdd5f1b07c2e4... \n", "2 this is my credit card: b31abe94871fbe42c4805a... \n", "3 this is my credit card: 1d5378ff80e8dbdf12c3d1... \n", "4 this is my credit card: f2d2c9b2a8fdfd376aa806... \n", "5 this is my credit card: e0690ff65144d6ff24fc2c... \n", "6 this is my credit card: bee6a0d7b4c91337eef146... 
\n", "7 c68e0b958a085134e22585ac92c08e3c \n", "8 22dabc63d739da192ef30a2bbcb06e61 \n", "9 444225bea558baa2a4c006f688853d1f \n", "10 10fc94534572f34b028c61523f4605fe \n", "11 b28d6de54411d3884d8b24d55e285004 \n", "12 ba76eb7fc1be6293e5feba89f5c7639f \n", "13 211acd46154d7a437dcc03b3ce46e5ce \n", "14 ba97f2ad35d4d9f6fe00ed50e2762327 \n", "15 53309f3853ff954ef7ed621b38501e28 \n", "16 99a0625ae373ff242d7ed9c76930b836 \n", "17 779c725caf15e67c16b59536eaa5b862 \n", "18 060783327b0e977a61614fa2129a7328 \n", "19 a22af5a670e749c4e8529a840088c372 \n", "\n", " master_cc balance active_member \\\n", "0 c077ea8331f1357b655546c0c6dd030c 100 1 \n", "1 1beee7a164fd2c76e8b4e588131d64c9 200 0 \n", "2 ff05ef02f05938f288220125c64d72e3 300 1 \n", "3 8c33e7d6d9f5b8b3e9f68b11416e9fe3 400 0 \n", "4 2fe45c134eebaf4476d23fea7642824a 500 1 \n", "5 9e3172495cc711edec297ef8a5f095a5 600 1 \n", "6 this is my master card number: dfcc41738316c7d... 700 0 \n", "7 6bb66b6d89abbbf0718f66720547ce57 800 1 \n", "8 a8f9b7c5b5e4c4b3f8a4d37d51078cac 900 0 \n", "9 6ad1e9a84f5dbd02512391a07c001ed5 1000 1 \n", "10 d45be3c831abd474467bdd0b5d9e4dd6 1100 0 \n", "11 52ccea324a0c1a50a266a33283645042 1200 1 \n", "12 39c34c75de06effa59da842781653c4a 1300 0 \n", "13 bdefeaadfd160b9ff82eb33bd2726b1d 1400 1 \n", "14 9fe8835d0d95deccf7dfd99c02c9c294 1500 0 \n", "15 e2737b551ebe7a0f4842f3f11bc2aa87 1600 1 \n", "16 8aba9728ea64663867b50a17c10bf729 1700 0 \n", "17 92610a6913a995c2d9f5e08bfcd6c105 1800 1 \n", "18 68321411ad37a3dccabe7902620ef7d0 1900 0 \n", "19 83d71d2e14d5de862d8bcd28c23c5417 2000 0 \n", "\n", " age \n", "0 24 \n", "1 36 \n", "2 23 \n", "3 28 \n", "4 37 \n", "5 31 \n", "6 41 \n", "7 43 \n", "8 58 \n", "9 67 \n", "10 25 \n", "11 29 \n", "12 38 \n", "13 64 \n", "14 66 \n", "15 55 \n", "16 43 \n", "17 48 \n", "18 58 \n", "19 29 " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "c.clean(df_with_pii)" ] }, { "cell_type": "code", "execution_count": null, "metadata": 
{}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "sanityze", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.15" }, "vscode": { "interpreter": { "hash": "97f10e6cbfe06dd65b050b0369919d407ad188a125c5838bb04c1d573b768972" } } }, "nbformat": 4, "nbformat_minor": 4 }