import sys, os
import pandas as pd
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
# Use below for charts in dark jupyter theme
THEME_DARK = False
if THEME_DARK:
# This is used if Jupyter Theme dark is enabled.
# The theme chosen can be activated with jupyter theme as follows:
# >>> jt -t oceans16 -T -nfs 115 -cellw 98% -N -kl -ofs 11 -altmd
font_size = '20.0'
dark_theme_config = {
"ytick.color" : "w",
"xtick.color" : "w",
"text.color": "white",
'font.size': font_size,
'axes.titlesize': font_size,
'axes.labelsize': font_size,
'xtick.labelsize': font_size,
'ytick.labelsize': font_size,
'legend.fontsize': font_size,
'figure.titlesize': font_size,
'figure.figsize': [20, 7],
'figure.facecolor': "#384151",
'legend.facecolor': "#384151",
"axes.labelcolor" : "w",
"axes.edgecolor" : "w"
}
plt.rcParams.update(dark_theme_config)
sys.path.append("..")
import xai
import xai.data
csv_path = 'data/adult.data'
categorical_cols = ["gender", "workclass", "education", "education-num", "marital-status",
"occupation", "relationship", "ethnicity", "loan"]
csv_columns = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
"occupation", "relationship", "ethnicity", "gender", "capital-gain", "capital-loss",
"hours-per-week", "loan"]
df = xai.data.load_census()
df.tail()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
age |
workclass |
education |
education-num |
marital-status |
occupation |
relationship |
ethnicity |
gender |
capital-gain |
capital-loss |
hours-per-week |
loan |
32556 |
27 |
Private |
Assoc-acdm |
12 |
Married-civ-spouse |
Tech-support |
Wife |
White |
Female |
0 |
0 |
38 |
<=50K |
32557 |
40 |
Private |
HS-grad |
9 |
Married-civ-spouse |
Machine-op-inspct |
Husband |
White |
Male |
0 |
0 |
40 |
>50K |
32558 |
58 |
Private |
HS-grad |
9 |
Widowed |
Adm-clerical |
Unmarried |
White |
Female |
0 |
0 |
40 |
<=50K |
32559 |
22 |
Private |
HS-grad |
9 |
Never-married |
Adm-clerical |
Own-child |
White |
Male |
0 |
0 |
20 |
<=50K |
32560 |
52 |
Self-emp-inc |
HS-grad |
9 |
Married-civ-spouse |
Exec-managerial |
Wife |
White |
Female |
15024 |
0 |
40 |
>50K |
target = "loan"
protected = ["ethnicity", "gender", "age"]
df_groups = xai.imbalance_plot(df, "gender", categorical_cols=categorical_cols)

groups = xai.imbalance_plot(df, "gender", "loan", categorical_cols=categorical_cols)

bal_df = xai.balance(df, "gender", "loan", upsample=0.8, categorical_cols=categorical_cols)

groups = xai.group_by_columns(df, ["gender", "loan"], categorical_cols=categorical_cols)
for group, group_df in groups:
print(group)
print(group_df["loan"].head(), "\n")
(' Female', ' <=50K')
4 <=50K
5 <=50K
6 <=50K
12 <=50K
21 <=50K
Name: loan, dtype: object
(' Female', ' >50K')
8 >50K
19 >50K
52 >50K
67 >50K
84 >50K
Name: loan, dtype: object
(' Male', ' <=50K')
0 <=50K
1 <=50K
2 <=50K
3 <=50K
13 <=50K
Name: loan, dtype: object
(' Male', ' >50K')
7 >50K
9 >50K
10 >50K
11 >50K
14 >50K
Name: loan, dtype: object
_ = xai.correlations(df, include_categorical=True, plot_type="matrix")

_ = xai.correlations(df, include_categorical=True)

proc_df = xai.normalize_numeric(bal_df)
proc_df = xai.convert_categories(proc_df)
x = proc_df.drop("loan", axis=1)
y = proc_df["loan"]
x_train, y_train, x_test, y_test, train_idx, test_idx = \
xai.balanced_train_test_split(
x, y, "gender",
min_per_group=300,
max_per_group=300,
categorical_cols=categorical_cols)
x_train_display = bal_df[train_idx]
x_test_display = bal_df[test_idx]
print("Total number of examples: ", x_test.shape[0])
df_test = x_test_display.copy()
df_test["loan"] = y_test
_= xai.imbalance_plot(df_test, "gender", "loan", categorical_cols=categorical_cols)
Total number of examples: 1200

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, mean_squared_error, roc_curve, auc
from tensorflow.keras.layers import Input, Dense, Flatten, \
Concatenate, concatenate, Dropout, Lambda, Embedding
from tensorflow.keras.models import Model, Sequential
def build_model(X):
input_els = []
encoded_els = []
dtypes = list(zip(X.dtypes.index, map(str, X.dtypes)))
for k,dtype in dtypes:
input_els.append(Input(shape=(1,)))
if dtype == "int8":
e = Flatten()(Embedding(X[k].max()+1, 1)(input_els[-1]))
else:
e = input_els[-1]
encoded_els.append(e)
encoded_els = concatenate(encoded_els)
layer1 = Dropout(0.5)(Dense(100, activation="relu")(encoded_els))
out = Dense(1, activation='sigmoid')(layer1)
# train model
model = Model(inputs=input_els, outputs=[out])
model.compile(optimizer="adam", loss='binary_crossentropy', metrics=['accuracy'])
return model
def f_in(X, m=None):
"""Preprocess input so it can be provided to a function"""
if m:
return [X.iloc[:m,i] for i in range(X.shape[1])]
else:
return [X.iloc[:,i] for i in range(X.shape[1])]
def f_out(probs, threshold=0.5):
"""Convert probabilities into classes"""
return list((probs >= threshold).astype(int).T[0])
model = build_model(x_train)
model.fit(f_in(x_train), y_train, epochs=50, batch_size=512)
Epoch 1/50
99/99 [==============================] - 1s 3ms/step - loss: 0.6227 - accuracy: 0.6459
Epoch 2/50
99/99 [==============================] - 0s 3ms/step - loss: 0.4600 - accuracy: 0.7812
Epoch 3/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3968 - accuracy: 0.8153
Epoch 4/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3789 - accuracy: 0.8215
Epoch 5/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3751 - accuracy: 0.8237
Epoch 6/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3771 - accuracy: 0.8235
Epoch 7/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3730 - accuracy: 0.8254
Epoch 8/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3675 - accuracy: 0.8312
Epoch 9/50
99/99 [==============================] - 0s 2ms/step - loss: 0.3685 - accuracy: 0.8281
Epoch 10/50
99/99 [==============================] - 0s 2ms/step - loss: 0.3620 - accuracy: 0.8313
Epoch 11/50
99/99 [==============================] - 0s 2ms/step - loss: 0.3687 - accuracy: 0.8297
Epoch 12/50
99/99 [==============================] - 0s 2ms/step - loss: 0.3698 - accuracy: 0.8292
Epoch 13/50
99/99 [==============================] - 0s 2ms/step - loss: 0.3666 - accuracy: 0.8285
Epoch 14/50
99/99 [==============================] - 0s 2ms/step - loss: 0.3649 - accuracy: 0.8305
Epoch 15/50
99/99 [==============================] - 0s 2ms/step - loss: 0.3628 - accuracy: 0.8326
Epoch 16/50
99/99 [==============================] - 0s 2ms/step - loss: 0.3669 - accuracy: 0.8306
Epoch 17/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3587 - accuracy: 0.8347
Epoch 18/50
99/99 [==============================] - 0s 2ms/step - loss: 0.3639 - accuracy: 0.8306
Epoch 19/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3618 - accuracy: 0.8335
Epoch 20/50
99/99 [==============================] - 0s 2ms/step - loss: 0.3628 - accuracy: 0.8315
Epoch 21/50
99/99 [==============================] - 0s 2ms/step - loss: 0.3641 - accuracy: 0.8325
Epoch 22/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3634 - accuracy: 0.8310
Epoch 23/50
99/99 [==============================] - 0s 2ms/step - loss: 0.3626 - accuracy: 0.8293
Epoch 24/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3659 - accuracy: 0.8298
Epoch 25/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3607 - accuracy: 0.8333
Epoch 26/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3600 - accuracy: 0.8321
Epoch 27/50
99/99 [==============================] - 0s 2ms/step - loss: 0.3650 - accuracy: 0.8296
Epoch 28/50
99/99 [==============================] - 0s 2ms/step - loss: 0.3626 - accuracy: 0.8317
Epoch 29/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3654 - accuracy: 0.8310
Epoch 30/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3659 - accuracy: 0.8322
Epoch 31/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3716 - accuracy: 0.8278
Epoch 32/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3631 - accuracy: 0.8326
Epoch 33/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3669 - accuracy: 0.8312
Epoch 34/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3604 - accuracy: 0.8325
Epoch 35/50
99/99 [==============================] - 0s 2ms/step - loss: 0.3625 - accuracy: 0.8318
Epoch 36/50
99/99 [==============================] - 0s 2ms/step - loss: 0.3605 - accuracy: 0.8326
Epoch 37/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3595 - accuracy: 0.8334
Epoch 38/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3653 - accuracy: 0.8316
Epoch 39/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3591 - accuracy: 0.8350
Epoch 40/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3602 - accuracy: 0.8337
Epoch 41/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3617 - accuracy: 0.8316
Epoch 42/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3624 - accuracy: 0.8320
Epoch 43/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3624 - accuracy: 0.8328
Epoch 44/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3602 - accuracy: 0.8326
Epoch 45/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3610 - accuracy: 0.8337
Epoch 46/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3626 - accuracy: 0.8323
Epoch 47/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3634 - accuracy: 0.8326
Epoch 48/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3614 - accuracy: 0.8328
Epoch 49/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3610 - accuracy: 0.8332
Epoch 50/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3590 - accuracy: 0.8332
<tensorflow.python.keras.callbacks.History at 0x7f1ee46da710>
score = model.evaluate(f_in(x_test), y_test, verbose=1)
print("Error %.4f: " % score[0])
print("Accuracy %.4f: " % (score[1]*100))
38/38 [==============================] - 0s 1ms/step - loss: 0.3630 - accuracy: 0.8292
Error 0.3630:
Accuracy 82.9167:
probabilities = model.predict(f_in(x_test))
pred = f_out(probabilities)
_= xai.metrics_plot(
y_test,
probabilities)

<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
age |
workclass |
education |
education-num |
marital-status |
occupation |
relationship |
ethnicity |
gender |
capital-gain |
capital-loss |
hours-per-week |
loan |
0 |
39 |
State-gov |
Bachelors |
13 |
Never-married |
Adm-clerical |
Not-in-family |
White |
Male |
2174 |
0 |
40 |
<=50K |
1 |
50 |
Self-emp-not-inc |
Bachelors |
13 |
Married-civ-spouse |
Exec-managerial |
Husband |
White |
Male |
0 |
0 |
13 |
<=50K |
2 |
38 |
Private |
HS-grad |
9 |
Divorced |
Handlers-cleaners |
Not-in-family |
White |
Male |
0 |
0 |
40 |
<=50K |
3 |
53 |
Private |
11th |
7 |
Married-civ-spouse |
Handlers-cleaners |
Husband |
Black |
Male |
0 |
0 |
40 |
<=50K |
4 |
28 |
Private |
Bachelors |
13 |
Married-civ-spouse |
Prof-specialty |
Wife |
Black |
Female |
0 |
0 |
40 |
<=50K |
_ = xai.metrics_plot(
y_test,
probabilities,
df=x_test_display,
cross_cols=["gender", "ethnicity"],
categorical_cols=categorical_cols)

_ = [xai.metrics_plot(
y_test,
probabilities,
df=x_test_display,
cross_cols=[p],
categorical_cols=categorical_cols) for p in protected]



xai.confusion_matrix_plot(y_test, pred)

xai.confusion_matrix_plot(y_test, pred, scaled=False)

_ = xai.roc_plot(y_test, probabilities)

_ = [xai.roc_plot(
y_test,
probabilities,
df=x_test_display,
cross_cols=[p],
categorical_cols=categorical_cols) for p in protected]



_= xai.pr_plot(y_test, probabilities)

_ = [xai.pr_plot(
y_test,
probabilities,
df=x_test_display,
cross_cols=[p],
categorical_cols=categorical_cols) for p in protected]



d = xai.smile_imbalance(
y_test,
probabilities)
/home/alejandro/miniconda3/lib/python3.7/site-packages/pandas/core/indexing.py:671: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
self._setitem_with_indexer(indexer, value)
WARNING:root:No categorical_cols passed so inferred using np.object, np.int8 and np.bool: Index(['target', 'manual-review'], dtype='object'). If you see an error these are not correct, please provide them as a string array as: categorical_cols=['col1', 'col2', ...]

d[["correct", "incorrect"]].sum().plot.bar()
<matplotlib.axes._subplots.AxesSubplot at 0x7f1effc41fd0>

d = xai.smile_imbalance(
y_test,
probabilities,
threshold=0.75,
display_breakdown=True)
WARNING:root:No categorical_cols passed so inferred using np.object, np.int8 and np.bool: Index(['target', 'manual-review'], dtype='object'). If you see an error these are not correct, please provide them as a string array as: categorical_cols=['col1', 'col2', ...]

display_bars = ["true-positives", "true-negatives",
"false-positives", "false-negatives"]
d[display_bars].sum().plot.bar()
<matplotlib.axes._subplots.AxesSubplot at 0x7f1effecd990>

d = xai.smile_imbalance(
y_test,
probabilities,
bins=9,
threshold=0.75,
manual_review=0.00001,
display_breakdown=False)
WARNING:root:No categorical_cols passed so inferred using np.object, np.int8 and np.bool: Index(['target', 'manual-review'], dtype='object'). If you see an error these are not correct, please provide them as a string array as: categorical_cols=['col1', 'col2', ...]

d[["correct", "incorrect", "manual-review"]].sum().plot.bar()
<matplotlib.axes._subplots.AxesSubplot at 0x7f1ec01c9850>

def get_avg(x, y):
return model.evaluate(f_in(x), y, verbose=0)[1]
imp = xai.feature_importance(x_test, y_test, get_avg)
imp.head()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
age |
workclass |
education |
education-num |
marital-status |
occupation |
relationship |
ethnicity |
gender |
capital-gain |
capital-loss |
hours-per-week |
0 |
0.01825 |
0.002167 |
0.000833 |
0.046 |
0.065667 |
0.019083 |
0.02425 |
0.00275 |
0.000833 |
0.05075 |
0.007833 |
0.014417 |
