Get a jump start with the platform by downloading our pre-configured request collections.
AVAILABLE COLLECTIONS
This is the multi-page printable view of this section. Click here to print.
Get a jump start with the platform by downloading our pre-configured request collections.
AVAILABLE COLLECTIONS
#!pip install pycryptodome
#!pip install pandas
#!pip install anonsdk_dir-1.0.0-py3-none-any.whl
# Surpress warnings
import warnings
warnings.filterwarnings('ignore')
# Import libraries
import time
import warnings
import os
import json
import requests
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import anonsdk as asdk
import datetime
# Function to authenticate with the Playground
def get_jwt_token(email, password):
    """Logs in to get the JWT token."""
    login_url = 'https://api.playground.protegrity.com/auth/login'
    login_payload = {
        "email": email,
        "password": password
    }
    login_headers = {
        'Content-Type': 'application/json'
    }
    login_response = requests.post(login_url, headers=login_headers, data=json.dumps(login_payload))
    if login_response.status_code == 201:
        jwt_token = login_response.json().get('jwt_token')
        print("Login successful, JWT token obtained.")
        return jwt_token
    else:
        print(f"Failed to login. Status code: {login_response.status_code}")
        print(login_response.text)
        return None
# API Playground Creds 
# REPLACE WITH YOUR OWN
# SIGNUP ON https://protegrity.com/api-playground
email = "your_email"
password = "your_password"
jwt = jwt_token = get_jwt_token(email, password)
api_key = "your_group_api_key"
Login successful, JWT token obtained.
# Connect to Anon service
ann_cluster = asdk.Connection('https://anon.playground.protegrity.com:443', api_key=api_key, jwt=jwt)
# Function to visualize Risk & Utility metrics
def display_risk_utility(riskmetrics, utilitymetrics):
    utilitymetrics2 = utilitymetrics.copy(deep=True)
    riskmetrics2 = riskmetrics.copy(deep=True)
    def convert(x):
        return np.round(x * 100, 1)
    riskmetrics2[['journalist', 'marketer', 'prosecutor']] = riskmetrics2[['journalist', 'marketer', 'prosecutor']].apply(convert, axis=1)
    
    utilitymetrics2[['Source', 'Result']] = utilitymetrics2[['Source', 'Result']].apply(convert, axis=1)
    # Risk metrics
    def display_subset(riskmetrics, risk_model):
        
        subset = riskmetrics.copy(deep=True)
        subset = subset[[risk_model, 'type']]
        
        subset.rename(columns={'type': 'Protegrity Anonymization'}, inplace=True)
        subset = subset.pivot(columns='Protegrity Anonymization')[risk_model]
        subset.rename(columns={'Result': 'After', 'Source': 'Before'}, inplace=True)
        subset.index.names = ['Metric']
        subset = subset[['Before', 'After']]
        
        ax = subset.plot.bar(rot=25, color=["#DADCE2", "#FA5A25"], figsize=(15, 7), title=f"Risk metrics for the {risk_model} model")
        for p in ax.patches:
            ax.annotate(
                f"{p.get_height()}%", (p.get_x()+p.get_width()/2., p.get_height()), ha='center', va='center', xytext=(0, 10), textcoords='offset points'
            )    
            
    display_subset(riskmetrics=riskmetrics2, risk_model='prosecutor')
    display_subset(riskmetrics=riskmetrics2, risk_model='journalist')
    display_subset(riskmetrics=riskmetrics2, risk_model='marketer')
    # Utility metrics    
    utilitymetrics2.rename(columns={'Source': 'Before', 'Result': 'After'}, inplace=True)
    
    ax = utilitymetrics2.plot.bar(rot=25, color=["#DADCE2", "#FA5A25"], figsize=(15, 7), title='Utility metrics')
    for p in ax.patches:
        ax.annotate(
            f"{p.get_height()}%", (p.get_x()+p.get_width()/2., p.get_height()), ha='center', va='center', xytext=(0, 10), textcoords='offset points'
        )
        
data = pd.read_csv('banking_us_customers.csv',sep=",")
# OPTIONAL
# Explore the dataset metadata
data.dtypes
Customer_ID object ZIP_Code int64 Age int64 Income int64 Monthly_Spending float64 Credit_Score int64 Loan_Amount float64 Savings_Balance float64 Investment_Value float64 customer_since object gender object marital_status object employment_type object education_level object dtype: object
# OPTIONAL
# View top 10 rows
data.head(10)
| Customer_ID | ZIP_Code | Age | Income | Monthly_Spending | Credit_Score | Loan_Amount | Savings_Balance | Investment_Value | customer_since | gender | marital_status | employment_type | education_level | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | CUST00000 | 97284 | 56 | 97447 | 1456.180871 | 715 | 8596.023209 | 16991.032776 | 34053.283289 | 10-09-2015 | Female | Widowed | Self-employed | High School | 
| 1 | CUST00001 | 1664 | 69 | 56444 | 1576.358950 | 683 | 6827.045391 | 28025.677876 | 34804.048571 | 02-01-2020 | Male | Married | Full-time | High School | 
| 2 | CUST00002 | 59775 | 46 | 465350 | 14352.799740 | 200 | 73477.763400 | 142336.205686 | 95509.483729 | 31-01-2011 | Male | Divorced | Executive | Master's | 
| 3 | CUST00003 | 6536 | 32 | 62499 | 1754.334125 | 635 | 15062.845249 | 27957.538613 | 17484.186124 | 26-05-2013 | Male | Divorced | Full-time | Bachelor's | 
| 4 | CUST00004 | 51719 | 60 | 56117 | 284.143214 | 751 | 9341.763858 | 32537.316031 | 10368.640109 | 14-06-2014 | Female | Married | Full-time | High School | 
| 5 | CUST00005 | 25794 | 25 | 72197 | 1451.345906 | 715 | 102.302454 | 35679.671452 | 52823.423677 | 13-11-2024 | Female | Single | Full-time | Bachelor's | 
| 6 | CUST00006 | 81206 | 78 | 76849 | 745.289043 | 712 | 13889.811264 | 14909.344906 | 13304.496414 | 22-10-2023 | Male | Divorced | Full-time | High School | 
| 7 | CUST00007 | 19501 | 38 | 108230 | 1362.136374 | 624 | 16212.546831 | 17780.910471 | 20704.115922 | 15-02-2007 | Female | Divorced | Full-time | Master's | 
| 8 | CUST00008 | 46171 | 56 | 76390 | 1438.105206 | 765 | 12665.672014 | 39307.516897 | 20134.941670 | 20-02-2025 | Male | Married | Full-time | Master's | 
| 9 | CUST00009 | 81628 | 75 | 73865 | 1988.074179 | 679 | 14592.055102 | 1847.644637 | 2119.144950 | 20-02-2025 | Male | Married | Full-time | High School | 
# REQUIRED IF RUNNING DATE GENERALIZATION
# Convert string to datetime
specific_time = datetime.time(12, 0, 0) # 12:00:00
data["customer_since"] = pd.to_datetime(data["customer_since"], format='%d-%m-%Y', errors='coerce')
data["customer_since"] = data['customer_since'].apply(lambda x: datetime.datetime.combine(x, specific_time))
# OPTIONAL
# Explore the categorical variables
gender = data['gender'].unique()
marital_status = data['marital_status'].unique()
education_level = data['education_level'].unique()
employment_type = data['employment_type'].unique()
categories = [gender, marital_status,education_level, employment_type]
categories
[array(['Female', 'Male'], dtype=object),
 array(['Widowed', 'Married', 'Divorced', 'Single'], dtype=object),
 array(['High School', "Master's", "Bachelor's", 'Doctorate'], dtype=object),
 array(['Self-employed', 'Full-time', 'Executive', 'Student', 'Unemployed',
        'Part-time'], dtype=object)]
# OPTIONAL
# Explore the continuous variables 
data.describe()
| ZIP_Code | Age | Income | Monthly_Spending | Credit_Score | Loan_Amount | Savings_Balance | Investment_Value | customer_since | |
|---|---|---|---|---|---|---|---|---|---|
| count | 1000.00000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000 | 
| mean | 49178.03000 | 49.857000 | 73966.931000 | 1618.234100 | 675.285000 | 16879.161773 | 19921.031345 | 29921.759261 | 2016-11-26 15:25:55.200000256 | 
| min | 685.00000 | 18.000000 | 10000.000000 | 100.000000 | 200.000000 | 21.865435 | 0.000000 | 0.000000 | 2005-06-16 12:00:00 | 
| 25% | 23011.50000 | 35.000000 | 57484.500000 | 1189.232363 | 646.000000 | 8961.730403 | 7377.817864 | 12241.335343 | 2012-04-21 00:00:00 | 
| 50% | 48257.50000 | 50.000000 | 71833.500000 | 1542.042571 | 680.500000 | 15149.274866 | 18999.775161 | 27708.266064 | 2017-05-27 12:00:00 | 
| 75% | 73587.25000 | 66.000000 | 85736.500000 | 1884.193245 | 713.250000 | 22115.301188 | 28666.410587 | 43242.834499 | 2021-11-25 12:00:00 | 
| max | 99876.00000 | 79.000000 | 554915.000000 | 19745.557210 | 843.000000 | 247277.439663 | 234170.369146 | 288179.853139 | 2025-06-11 12:00:00 | 
| std | 28659.40733 | 18.114267 | 38566.963794 | 1175.054243 | 69.301937 | 16385.480169 | 17435.921945 | 25606.270171 | NaN | 
# JOB CONFIGURATION
# IMPORTANT – LIMITED VERSION OF THE PRODUCT
# The job configuration supports only 5 attribute transformations. 
# More attributes will cause the job to fail
# Connect to the Anonymization Cluster
# Create a Dataset Object
e = asdk.AnonElement(ann_cluster, data, pty_storage=False)
# Set K for the dataset
e.config.k = asdk.K(2)
# Fields to preserve as is
preserved_list = ['Credit_Score']
e.assign(preserved_list, asdk.Preserve())
# Fields to redact
e['Customer_ID'] = asdk.Redact()
# Generalize Age and Weight
#e["Monthly_Spending"] = asdk.Gen_Interval(['1000', '2000'])
#e["Savings_Balance"] = asdk.Gen_Interval(['1000', '10000'])
e["Investment_Value"] = asdk.Gen_Interval(['1000', '10000'])
e["Age"] = asdk.Gen_Interval(['5', '10'], importance = 0.8)
# Employment Tree Gen
employmentTree = {
  'lvl0': ['Self-employed', 'Full-time', 'Executive', 'Student', 'Unemployed', 'Part-time'],
  'lvl1': ["Self-employed", "Employed", "Employed", "Unemployed", "Unemployed", "Employed"],
  'lvl2': ["Employed", "Employed", "Employed", "Unemployed", "Unemployed", "Employed"],
}
e['employment_type'] = asdk.Gen_Tree(pd.DataFrame(data=employmentTree), importance = 0.2)
#e["employment_type"] = asdk.LDiv(lfactor=3)
# Micro-aggregation
e["Income"] = asdk.MicroAgg(asdk.AggregateFunction.Mean, importance=0.2)
# L-Diversity 
# Make sure there are at least 2 different sensitive values per record
e["gender"] = asdk.LDiv(lfactor=2)
# Generalize Date
#['H.M.S.ms', 'H.M.S', 'H.M', 'H', 'WD.M.Y', 'W.M.Y', 'FD.M.Y', 'M.Y', 'QTR.Y', 'Y', 'DEC', 'CEN']
#e['customer_since'] = asdk.Gen_Rounding(["M.Y", "Y"], importance=0.2)
# Mask Characters
#e['citizenSince'] = asdk.Gen_Mask(maskchar="*")
# Max suppresion 
# I.e. maximum allowed fraction of records removed from the dataset to achieve the set privacy goals
e.config['maxSuppression'] = 0.5
# Explore the dataset and transformation metadata
e.describe()
DataFrame Columns:Index(['Customer_ID', 'ZIP_Code', 'Age', 'Income', 'Monthly_Spending',
       'Credit_Score', 'Loan_Amount', 'Savings_Balance', 'Investment_Value',
       'customer_since', 'gender', 'marital_status', 'employment_type',
       'education_level'],
      dtype='object')
Attribute Config:
Customer_ID object Direct Identifier attribute.
ZIP_Code int64 Ignored attribute.
Age int64 Interval based with levels [['5', '10']]. Having lower bound [18] and upper bound [79] and importance [0.8]
Income int64 Micro-Aggregation [Mean]
Monthly_Spending float64 Ignored attribute.
Credit_Score int64 Non-Sensitive attribute.
Loan_Amount float64 Ignored attribute.
Savings_Balance float64 Ignored attribute.
Investment_Value float64 Interval based with levels [['1000', '10000']]. Having lower bound [0.0] and upper bound [288179.8531389654] and importance [0.5]
customer_since datetime64[ns] Ignored attribute.
gender object  Sensitive data. L-Diversity with l factor of [2]
marital_status object Ignored attribute.
employment_type object Tree Based Generalization. Missing Hierarchy will be generalized to [[]] and importance [0.2]
education_level object Ignored attribute.
Config: K factor [2] {'maxSuppression': 0.5, 'redactOutliers': True}
# JOB RUN
job = asdk.anonymize(e, pty_storage=False)
job.status()
{'completed': None,
 'id': '96786005-2df3-4ee9-b0cf-e77be434166c',
 'info': None,
 'running': None,
 'status': 'InQueue'}
job.result().df.head(10)
| Age | Credit_Score | Investment_Value | gender | employment_type | Customer_ID | Income | |
|---|---|---|---|---|---|---|---|
| 0 | 30 - 39 | 635 | 17000.00 - 17999.99 | Male | Full-time | * | 65470.750000 | 
| 1 | 70 - 79 | 712 | 13000.00 - 13999.99 | Male | Full-time | * | 55205.000000 | 
| 2 | 70 - 79 | 679 | 2000.00 - 2999.99 | Male | Full-time | * | 65445.500000 | 
| 3 | 30 - 39 | 716 | 0.00 - 999.99 | Male | Full-time | * | 73903.333333 | 
| 4 | 40 - 49 | 539 | 3000.00 - 3999.99 | Male | Full-time | * | 65990.000000 | 
| 5 | 70 - 79 | 709 | 28000.00 - 28999.99 | Female | Full-time | * | 59681.200000 | 
| 6 | 50 - 59 | 707 | 27000.00 - 27999.99 | Male | Full-time | * | 73164.666667 | 
| 7 | 40 - 49 | 668 | 0.00 - 999.99 | Female | Full-time | * | 63152.090909 | 
| 8 | 50 - 59 | 610 | 27000.00 - 27999.99 | Male | Full-time | * | 73164.666667 | 
| 9 | 70 - 79 | 666 | 37000.00 - 37999.99 | Female | Full-time | * | 84324.833333 | 
job.riskStat().iloc[[0,1,3,4]]
| journalist | marketer | prosecutor | type | |
|---|---|---|---|---|
| avgRecordIdentification | 0.9620 | 0.9620 | 0.9620 | Source | 
| maxProbabilityIdentification | 1.0000 | 0.0000 | 1.0000 | Source | 
| avgRecordIdentification | 0.2775 | 0.2775 | 0.2775 | Result | 
| maxProbabilityIdentification | 0.5000 | 0.0000 | 0.5000 | Result | 
display_risk_utility(riskmetrics=job.riskStat().iloc[[0,1,3,4]], utilitymetrics=job.utilityStat())
job.riskStat().iloc[[0,1,3,4]]
| journalist | marketer | prosecutor | type | |
|---|---|---|---|---|
| avgRecordIdentification | 0.9620 | 0.9620 | 0.9620 | Source | 
| maxProbabilityIdentification | 1.0000 | 0.0000 | 1.0000 | Source | 
| avgRecordIdentification | 0.0664 | 0.0664 | 0.0664 | Result | 
| maxProbabilityIdentification | 0.2500 | 0.0000 | 0.2500 | Result | 
display_risk_utility(riskmetrics=job.riskStat().iloc[[0,1,3,4]], utilitymetrics=job.utilityStat())
job.riskStat().iloc[[0,1,3,4]]
| journalist | marketer | prosecutor | type | |
|---|---|---|---|---|
| avgRecordIdentification | 0.9480 | 0.9480 | 0.9480 | Source | 
| maxProbabilityIdentification | 1.0000 | 0.0000 | 1.0000 | Source | 
| avgRecordIdentification | 0.0402 | 0.0402 | 0.0402 | Result | 
| maxProbabilityIdentification | 0.1667 | 0.0000 | 0.1667 | Result | 
display_risk_utility(riskmetrics=job.riskStat().iloc[[0,1,3,4]], utilitymetrics=job.utilityStat())