Setup Steps¶
In [1]:
#!pip install pycryptodome
#!pip install pandas
#!pip install anonsdk_dir-1.0.0-py3-none-any.whl
In [2]:
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')
In [3]:
# Import libraries
import time
import os
import json
import requests
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import anonsdk as asdk
import datetime
Authentication (Anonymization Engine + API Playground)¶
In [4]:
# Function to authenticate with the Playground
def get_jwt_token(email, password):
    """Logs in to get the JWT token."""
    login_url = 'https://api.playground.protegrity.com/auth/login'
    login_payload = {
        "email": email,
        "password": password
    }
    login_headers = {
        'Content-Type': 'application/json'
    }
    login_response = requests.post(login_url, headers=login_headers, data=json.dumps(login_payload))
    if login_response.status_code == 201:
        jwt_token = login_response.json().get('jwt_token')
        print("Login successful, JWT token obtained.")
        return jwt_token
    else:
        print(f"Failed to login. Status code: {login_response.status_code}")
        print(login_response.text)
        return None
In [5]:
# API Playground credentials
# REPLACE WITH YOUR OWN
# Sign up at https://protegrity.com/api-playground
email = "your_email"
password = "your_password"
jwt = get_jwt_token(email, password)
api_key = "your_group_api_key"
Login successful, JWT token obtained.
In [6]:
# Connect to Anon service
ann_cluster = asdk.Connection('https://anon.playground.protegrity.com:443', api_key=api_key, jwt=jwt)
In [7]:
# Function to visualize Risk & Utility metrics
def display_risk_utility(riskmetrics, utilitymetrics):
    utilitymetrics2 = utilitymetrics.copy(deep=True)
    riskmetrics2 = riskmetrics.copy(deep=True)

    def convert(x):
        return np.round(x * 100, 1)

    riskmetrics2[['journalist', 'marketer', 'prosecutor']] = riskmetrics2[['journalist', 'marketer', 'prosecutor']].apply(convert, axis=1)
    utilitymetrics2[['Source', 'Result']] = utilitymetrics2[['Source', 'Result']].apply(convert, axis=1)

    # Risk metrics
    def display_subset(riskmetrics, risk_model):
        subset = riskmetrics.copy(deep=True)
        subset = subset[[risk_model, 'type']]
        subset.rename(columns={'type': 'Protegrity Anonymization'}, inplace=True)
        subset = subset.pivot(columns='Protegrity Anonymization')[risk_model]
        subset.rename(columns={'Result': 'After', 'Source': 'Before'}, inplace=True)
        subset.index.names = ['Metric']
        subset = subset[['Before', 'After']]
        ax = subset.plot.bar(rot=25, color=["#DADCE2", "#FA5A25"], figsize=(15, 7), title=f"Risk metrics for the {risk_model} model")
        for p in ax.patches:
            ax.annotate(
                f"{p.get_height()}%", (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points'
            )

    display_subset(riskmetrics=riskmetrics2, risk_model='prosecutor')
    display_subset(riskmetrics=riskmetrics2, risk_model='journalist')
    display_subset(riskmetrics=riskmetrics2, risk_model='marketer')

    # Utility metrics
    utilitymetrics2.rename(columns={'Source': 'Before', 'Result': 'After'}, inplace=True)
    ax = utilitymetrics2.plot.bar(rot=25, color=["#DADCE2", "#FA5A25"], figsize=(15, 7), title='Utility metrics')
    for p in ax.patches:
        ax.annotate(
            f"{p.get_height()}%", (p.get_x() + p.get_width() / 2., p.get_height()),
            ha='center', va='center', xytext=(0, 10), textcoords='offset points'
        )
Data Exploration & Preparation¶
In [8]:
data = pd.read_csv('banking_us_customers.csv', sep=",")
In [9]:
# OPTIONAL
# Explore the dataset metadata
data.dtypes
Out[9]:
Customer_ID          object
ZIP_Code              int64
Age                   int64
Income                int64
Monthly_Spending    float64
Credit_Score          int64
Loan_Amount         float64
Savings_Balance     float64
Investment_Value    float64
customer_since       object
gender               object
marital_status       object
employment_type      object
education_level      object
dtype: object
In [10]:
# OPTIONAL
# View top 10 rows
data.head(10)
Out[10]:
| | Customer_ID | ZIP_Code | Age | Income | Monthly_Spending | Credit_Score | Loan_Amount | Savings_Balance | Investment_Value | customer_since | gender | marital_status | employment_type | education_level |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | CUST00000 | 97284 | 56 | 97447 | 1456.180871 | 715 | 8596.023209 | 16991.032776 | 34053.283289 | 10-09-2015 | Female | Widowed | Self-employed | High School |
| 1 | CUST00001 | 1664 | 69 | 56444 | 1576.358950 | 683 | 6827.045391 | 28025.677876 | 34804.048571 | 02-01-2020 | Male | Married | Full-time | High School |
| 2 | CUST00002 | 59775 | 46 | 465350 | 14352.799740 | 200 | 73477.763400 | 142336.205686 | 95509.483729 | 31-01-2011 | Male | Divorced | Executive | Master's |
| 3 | CUST00003 | 6536 | 32 | 62499 | 1754.334125 | 635 | 15062.845249 | 27957.538613 | 17484.186124 | 26-05-2013 | Male | Divorced | Full-time | Bachelor's |
| 4 | CUST00004 | 51719 | 60 | 56117 | 284.143214 | 751 | 9341.763858 | 32537.316031 | 10368.640109 | 14-06-2014 | Female | Married | Full-time | High School |
| 5 | CUST00005 | 25794 | 25 | 72197 | 1451.345906 | 715 | 102.302454 | 35679.671452 | 52823.423677 | 13-11-2024 | Female | Single | Full-time | Bachelor's |
| 6 | CUST00006 | 81206 | 78 | 76849 | 745.289043 | 712 | 13889.811264 | 14909.344906 | 13304.496414 | 22-10-2023 | Male | Divorced | Full-time | High School |
| 7 | CUST00007 | 19501 | 38 | 108230 | 1362.136374 | 624 | 16212.546831 | 17780.910471 | 20704.115922 | 15-02-2007 | Female | Divorced | Full-time | Master's |
| 8 | CUST00008 | 46171 | 56 | 76390 | 1438.105206 | 765 | 12665.672014 | 39307.516897 | 20134.941670 | 20-02-2025 | Male | Married | Full-time | Master's |
| 9 | CUST00009 | 81628 | 75 | 73865 | 1988.074179 | 679 | 14592.055102 | 1847.644637 | 2119.144950 | 20-02-2025 | Male | Married | Full-time | High School |
In [11]:
# REQUIRED IF RUNNING DATE GENERALIZATION
# Convert string to datetime
specific_time = datetime.time(12, 0, 0) # 12:00:00
data["customer_since"] = pd.to_datetime(data["customer_since"], format='%d-%m-%Y', errors='coerce')
data["customer_since"] = data['customer_since'].apply(lambda x: datetime.datetime.combine(x, specific_time))
In [12]:
# OPTIONAL
# Explore the categorical variables
gender = data['gender'].unique()
marital_status = data['marital_status'].unique()
education_level = data['education_level'].unique()
employment_type = data['employment_type'].unique()
categories = [gender, marital_status, education_level, employment_type]
categories
Out[12]:
[array(['Female', 'Male'], dtype=object),
 array(['Widowed', 'Married', 'Divorced', 'Single'], dtype=object),
 array(['High School', "Master's", "Bachelor's", 'Doctorate'], dtype=object),
 array(['Self-employed', 'Full-time', 'Executive', 'Student', 'Unemployed', 'Part-time'], dtype=object)]
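Note that the six employment_type values listed here are exactly the lvl0 leaves of the employmentTree hierarchy defined in the Job Configuration section below, and gender is the column used later as the sensitive attribute for l-diversity.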
In [13]:
# OPTIONAL
# Explore the continuous variables
data.describe()
Out[13]:
| | ZIP_Code | Age | Income | Monthly_Spending | Credit_Score | Loan_Amount | Savings_Balance | Investment_Value | customer_since |
|---|---|---|---|---|---|---|---|---|---|
| count | 1000.00000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000 |
| mean | 49178.03000 | 49.857000 | 73966.931000 | 1618.234100 | 675.285000 | 16879.161773 | 19921.031345 | 29921.759261 | 2016-11-26 15:25:55.200000256 |
| min | 685.00000 | 18.000000 | 10000.000000 | 100.000000 | 200.000000 | 21.865435 | 0.000000 | 0.000000 | 2005-06-16 12:00:00 |
| 25% | 23011.50000 | 35.000000 | 57484.500000 | 1189.232363 | 646.000000 | 8961.730403 | 7377.817864 | 12241.335343 | 2012-04-21 00:00:00 |
| 50% | 48257.50000 | 50.000000 | 71833.500000 | 1542.042571 | 680.500000 | 15149.274866 | 18999.775161 | 27708.266064 | 2017-05-27 12:00:00 |
| 75% | 73587.25000 | 66.000000 | 85736.500000 | 1884.193245 | 713.250000 | 22115.301188 | 28666.410587 | 43242.834499 | 2021-11-25 12:00:00 |
| max | 99876.00000 | 79.000000 | 554915.000000 | 19745.557210 | 843.000000 | 247277.439663 | 234170.369146 | 288179.853139 | 2025-06-11 12:00:00 |
| std | 28659.40733 | 18.114267 | 38566.963794 | 1175.054243 | 69.301937 | 16385.480169 | 17435.921945 | 25606.270171 | NaN |
Job Configuration¶
In [36]:
# JOB CONFIGURATION
# IMPORTANT – LIMITED VERSION OF THE PRODUCT
# The job configuration supports only 5 attribute transformations.
# More attributes will cause the job to fail.
# Create a Dataset Object on the Anonymization Cluster connection
e = asdk.AnonElement(ann_cluster, data, pty_storage=False)
# Set K for the dataset
e.config.k = asdk.K(2)
# Fields to preserve as is
preserved_list = ['Credit_Score']
e.assign(preserved_list, asdk.Preserve())
# Fields to redact
e['Customer_ID'] = asdk.Redact()
# Generalize Investment_Value and Age into intervals
#e["Monthly_Spending"] = asdk.Gen_Interval(['1000', '2000'])
#e["Savings_Balance"] = asdk.Gen_Interval(['1000', '10000'])
e["Investment_Value"] = asdk.Gen_Interval(['1000', '10000'])
e["Age"] = asdk.Gen_Interval(['5', '10'], importance = 0.8)
# Employment Tree Gen
employmentTree = {
'lvl0': ['Self-employed', 'Full-time', 'Executive', 'Student', 'Unemployed', 'Part-time'],
'lvl1': ["Self-employed", "Employed", "Employed", "Unemployed", "Unemployed", "Employed"],
'lvl2': ["Employed", "Employed", "Employed", "Unemployed", "Unemployed", "Employed"],
}
e['employment_type'] = asdk.Gen_Tree(pd.DataFrame(data=employmentTree), importance = 0.2)
#e["employment_type"] = asdk.LDiv(lfactor=3)
# Micro-aggregation
e["Income"] = asdk.MicroAgg(asdk.AggregateFunction.Mean, importance=0.2)
# L-Diversity
# Make sure there are at least 2 different sensitive values per equivalence class
e["gender"] = asdk.LDiv(lfactor=2)
# Generalize Date
#['H.M.S.ms', 'H.M.S', 'H.M', 'H', 'WD.M.Y', 'W.M.Y', 'FD.M.Y', 'M.Y', 'QTR.Y', 'Y', 'DEC', 'CEN']
#e['customer_since'] = asdk.Gen_Rounding(["M.Y", "Y"], importance=0.2)
# Mask Characters
#e['citizenSince'] = asdk.Gen_Mask(maskchar="*")
# Max suppression,
# i.e. the maximum allowed fraction of records removed from the dataset to achieve the set privacy goals
e.config['maxSuppression'] = 0.5
# Explore the dataset and transformation metadata
e.describe()
DataFrame Columns:
Index(['Customer_ID', 'ZIP_Code', 'Age', 'Income', 'Monthly_Spending',
       'Credit_Score', 'Loan_Amount', 'Savings_Balance', 'Investment_Value',
       'customer_since', 'gender', 'marital_status', 'employment_type',
       'education_level'],
      dtype='object')

Attribute Config:
Customer_ID        object           Direct Identifier attribute.
ZIP_Code           int64            Ignored attribute.
Age                int64            Interval based with levels [['5', '10']]. Having lower bound [18] and upper bound [79] and importance [0.8]
Income             int64            Micro-Aggregation [Mean]
Monthly_Spending   float64          Ignored attribute.
Credit_Score       int64            Non-Sensitive attribute.
Loan_Amount        float64          Ignored attribute.
Savings_Balance    float64          Ignored attribute.
Investment_Value   float64          Interval based with levels [['1000', '10000']]. Having lower bound [0.0] and upper bound [288179.8531389654] and importance [0.5]
customer_since     datetime64[ns]   Ignored attribute.
gender             object           Sensitive data. L-Diversity with l factor of [2]
marital_status     object           Ignored attribute.
employment_type    object           Tree Based Generalization. Missing Hierarchy will be generalized to [[]] and importance [0.2]
education_level    object           Ignored attribute.

Config:
K factor [2]
{'maxSuppression': 0.5, 'redactOutliers': True}
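For intuition: e.config.k = asdk.K(2) asks for k-anonymity, i.e. every combination of quasi-identifier values in the output must be shared by at least k records. A quick sanity check you can run against the anonymized result (a sketch; the quasi-identifier list is an assumption based on the attributes generalized above):

# Sketch: measure the effective k of an anonymized DataFrame.
# The quasi-identifier list below is an assumption -- adjust it to your configuration.
def effective_k(df, quasi_identifiers):
    """Size of the smallest equivalence class over the quasi-identifiers."""
    return int(df.groupby(quasi_identifiers).size().min())

# Hypothetical usage on the job result from the next section:
# effective_k(job.result().df, ['Age', 'Investment_Value', 'employment_type'])  # expect >= 2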
Job Run¶
In [37]:
# JOB RUN
job = asdk.anonymize(e, pty_storage=False)
In [38]:
job.status()
Out[38]:
{'completed': None, 'id': '96786005-2df3-4ee9-b0cf-e77be434166c', 'info': None, 'running': None, 'status': 'InQueue'}
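The job runs asynchronously: the status moves from InQueue through a running state before results are available. To block until it finishes, a simple polling loop works; note that apart from 'InQueue' (seen in the output above), the status strings below are assumptions, so check the values your anonsdk version actually returns:

# Sketch: poll the job until it leaves the queue.
# 'InQueue' comes from the output above; 'Running' and the terminal states are assumptions.
def wait_for_job(job, poll_seconds=5, timeout_seconds=600):
    deadline = time.time() + timeout_seconds
    while time.time() < deadline:
        status = job.status().get('status')
        if status not in ('InQueue', 'Running'):
            return status  # e.g. a completed or failed state
        time.sleep(poll_seconds)
    raise TimeoutError("Anonymization job did not finish in time")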
In [39]:
job.result().df.head(10)
Out[39]:
| | Age | Credit_Score | Investment_Value | gender | employment_type | Customer_ID | Income |
|---|---|---|---|---|---|---|---|
| 0 | 30 - 39 | 635 | 17000.00 - 17999.99 | Male | Full-time | * | 65470.750000 |
| 1 | 70 - 79 | 712 | 13000.00 - 13999.99 | Male | Full-time | * | 55205.000000 |
| 2 | 70 - 79 | 679 | 2000.00 - 2999.99 | Male | Full-time | * | 65445.500000 |
| 3 | 30 - 39 | 716 | 0.00 - 999.99 | Male | Full-time | * | 73903.333333 |
| 4 | 40 - 49 | 539 | 3000.00 - 3999.99 | Male | Full-time | * | 65990.000000 |
| 5 | 70 - 79 | 709 | 28000.00 - 28999.99 | Female | Full-time | * | 59681.200000 |
| 6 | 50 - 59 | 707 | 27000.00 - 27999.99 | Male | Full-time | * | 73164.666667 |
| 7 | 40 - 49 | 668 | 0.00 - 999.99 | Female | Full-time | * | 63152.090909 |
| 8 | 50 - 59 | 610 | 27000.00 - 27999.99 | Male | Full-time | * | 73164.666667 |
| 9 | 70 - 79 | 666 | 37000.00 - 37999.99 | Female | Full-time | * | 84324.833333 |
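Only the configured attributes survive into the result: Customer_ID is redacted to *, Age and Investment_Value are generalized into intervals, employment_type is generalized along the employmentTree where needed, Income is replaced by the mean of each record's equivalence class (note rows 6 and 8 sharing 73164.666667), and the preserved Credit_Score passes through unchanged. Ignored attributes such as ZIP_Code and Monthly_Spending are dropped from the output.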
Job Results¶
Risk & Utility Measurements (Run #1; k = 2)¶
In [18]:
job.riskStat().iloc[[0,1,3,4]]
Out[18]:
| | journalist | marketer | prosecutor | type |
|---|---|---|---|---|
| avgRecordIdentification | 0.9620 | 0.9620 | 0.9620 | Source |
| maxProbabilityIdentification | 1.0000 | 0.0000 | 1.0000 | Source |
| avgRecordIdentification | 0.2775 | 0.2775 | 0.2775 | Result |
| maxProbabilityIdentification | 0.5000 | 0.0000 | 0.5000 | Result |
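The maxProbabilityIdentification for the prosecutor and journalist models drops from 1.0 to 0.5, which is exactly 1/k for k = 2: every record now shares its quasi-identifier values with at least one other record, so the best single-record guess succeeds at most half the time.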
In [19]:
display_risk_utility(riskmetrics=job.riskStat().iloc[[0,1,3,4]], utilitymetrics=job.utilityStat())
Risk & Utility Measurements (Run #2; k = 4)¶
In [26]:
job.riskStat().iloc[[0,1,3,4]]
Out[26]:
| | journalist | marketer | prosecutor | type |
|---|---|---|---|---|
| avgRecordIdentification | 0.9620 | 0.9620 | 0.9620 | Source |
| maxProbabilityIdentification | 1.0000 | 0.0000 | 1.0000 | Source |
| avgRecordIdentification | 0.0664 | 0.0664 | 0.0664 | Result |
| maxProbabilityIdentification | 0.2500 | 0.0000 | 0.2500 | Result |
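With k = 4 the maximum drops to 1/k = 1/4 = 0.25, and the average re-identification probability falls from 27.75% in Run #1 to 6.64%, typically at the cost of coarser generalizations.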
In [27]:
display_risk_utility(riskmetrics=job.riskStat().iloc[[0,1,3,4]], utilitymetrics=job.utilityStat())
Risk & Utility Measurements (Run #3; k = 4; l = 3 [employment])¶
In [32]:
job.riskStat().iloc[[0,1,3,4]]
Out[32]:
| | journalist | marketer | prosecutor | type |
|---|---|---|---|---|
| avgRecordIdentification | 0.9480 | 0.9480 | 0.9480 | Source |
| maxProbabilityIdentification | 1.0000 | 0.0000 | 1.0000 | Source |
| avgRecordIdentification | 0.0402 | 0.0402 | 0.0402 | Result |
| maxProbabilityIdentification | 0.1667 | 0.0000 | 0.1667 | Result |
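Here the maximum is 0.1667 ≈ 1/6, suggesting that the l-diversity constraint (l = 3 on employment_type) combined with k = 4 pushed the smallest equivalence class up to 6 records.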
In [33]:
display_risk_utility(riskmetrics=job.riskStat().iloc[[0,1,3,4]], utilitymetrics=job.utilityStat())