xai/examples at master · EthicalML/xai

Name	Name	Last commit message	Last commit date
parent directory ..
.ipynb_checkpoints	.ipynb_checkpoints
XAI Tabular Data Example Usage_files	XAI Tabular Data Example Usage_files
data	data
README.md	README.md
XAI Tabular Data Example Usage.ipynb	XAI Tabular Data Example Usage.ipynb

import sys, os
import pandas as pd
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import make_pipeline

# Flip THEME_DARK to True to restyle matplotlib charts for a dark Jupyter theme.

THEME_DARK = False

if THEME_DARK:
    # Only relevant when a dark Jupyter theme is active, e.g. one enabled with:
    # >>> jt -t oceans16 -T -nfs 115 -cellw 98% -N  -kl -ofs 11 -altmd
    font_size = '20.0'

    # Light tick/text colours first ...
    dark_theme_config = {
        "ytick.color": "w",
        "xtick.color": "w",
        "text.color": "white",
    }
    # ... then one shared font size for every text element ...
    dark_theme_config.update(
        (size_key, font_size)
        for size_key in (
            'font.size',
            'axes.titlesize',
            'axes.labelsize',
            'xtick.labelsize',
            'ytick.labelsize',
            'legend.fontsize',
            'figure.titlesize',
        )
    )
    # ... and finally figure geometry, dark backgrounds and light axes.
    dark_theme_config.update({
        'figure.figsize': [20, 7],
        'figure.facecolor': "#384151",
        'legend.facecolor': "#384151",
        "axes.labelcolor": "w",
        "axes.edgecolor": "w",
    })
    plt.rcParams.update(dark_theme_config)

# Add the parent directory to the import path before importing xai
# (presumably so the repository's own copy of the package is used - confirm).
sys.path.append("..")

import xai
import xai.data

# Path and column schema of the raw census CSV. Note that load_census() below
# is called without arguments, so csv_path and csv_columns are not passed to it.
csv_path = 'data/adult.data'
# Columns to be treated as categorical; handed to the xai helpers further down
# through their categorical_cols= argument.
categorical_cols = ["gender", "workclass", "education", "education-num", "marital-status",
                   "occupation", "relationship", "ethnicity", "loan"]
csv_columns = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
                   "occupation", "relationship", "ethnicity", "gender", "capital-gain", "capital-loss",
                   "hours-per-week", "loan"]

# Load the dataset through xai's helper and show its last rows
# (a bare expression, so it only displays in a notebook).
df = xai.data.load_census()
df.tail()

<style>
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	age	workclass	education	education-num	marital-status	occupation	relationship	ethnicity	gender	capital-gain	hours-per-week	loan
32556	27	Private	Assoc-acdm	12	Married-civ-spouse	Tech-support	Wife	White	Female	0	38	<=50K
32557	40	Private	HS-grad	9	Married-civ-spouse	Machine-op-inspct	Husband	White	Male	0	40	>50K
32558	58	Private	HS-grad	9	Widowed	Adm-clerical	Unmarried	White	Female	0	40	<=50K
32559	22	Private	HS-grad	9	Never-married	Adm-clerical	Own-child	White	Male	0	20	<=50K
32560	52	Self-emp-inc	HS-grad	9	Married-civ-spouse	Exec-managerial	Wife	White	Female	15024	40	>50K

# Name of the label column, and the columns later used one at a time as
# cross_cols when plotting per-group metrics.
target = "loan"
protected = ["ethnicity", "gender", "age"]

# Class imbalance of the data by gender alone ...
df_groups = xai.imbalance_plot(df, "gender", categorical_cols=categorical_cols)

# ... and by the (gender, loan) combination.
groups = xai.imbalance_plot(df, "gender", "loan", categorical_cols=categorical_cols)

# Rebalance across (gender, loan); bal_df is the frame used for training below.
# NOTE(review): exact meaning of upsample=0.8 is defined by xai.balance - confirm.
bal_df = xai.balance(df, "gender", "loan", upsample=0.8, categorical_cols=categorical_cols)

# Group the original (unbalanced) frame by (gender, loan) and preview each
# group's label values. This rebinds `groups` from the plot result above.
groups = xai.group_by_columns(df, ["gender", "loan"], categorical_cols=categorical_cols)
for group, group_df in groups:
    print(group)
    print(group_df["loan"].head(), "\n")

(' Female', ' <=50K')
4      <=50K
5      <=50K
6      <=50K
12     <=50K
21     <=50K
Name: loan, dtype: object 

(' Female', ' >50K')
8      >50K
19     >50K
52     >50K
67     >50K
84     >50K
Name: loan, dtype: object 

(' Male', ' <=50K')
0      <=50K
1      <=50K
2      <=50K
3      <=50K
13     <=50K
Name: loan, dtype: object 

(' Male', ' >50K')
7      >50K
9      >50K
10     >50K
11     >50K
14     >50K
Name: loan, dtype: object

# Feature correlations, first with plot_type="matrix" ...
_ = xai.correlations(df, include_categorical=True, plot_type="matrix")

# ... then with the function's default plot type.
_ = xai.correlations(df, include_categorical=True)

# Preprocess the balanced frame and separate features (x) from the label (y).
proc_df = xai.normalize_numeric(bal_df)
proc_df = xai.convert_categories(proc_df)
x = proc_df.drop("loan", axis=1)
y = proc_df["loan"]

# Split into train/test with the test set balanced on "gender".
# NOTE(review): min/max_per_group=300 presumably fixes the test-set size per
# (group, label) combination - confirm against xai.balanced_train_test_split.
x_train, y_train, x_test, y_test, train_idx, test_idx = \
    xai.balanced_train_test_split(
            x, y, "gender", 
            min_per_group=300,
            max_per_group=300,
            categorical_cols=categorical_cols)

# Keep the un-normalised, un-encoded rows of each split for readable plots.
x_train_display = bal_df[train_idx]
x_test_display = bal_df[test_idx]

# Reports the number of rows in the test split.
print("Total number of examples: ", x_test.shape[0])

# Test-set copy whose "loan" column is set from y_test, used only for the plot below.
df_test = x_test_display.copy()
df_test["loan"] = y_test

_= xai.imbalance_plot(df_test, "gender", "loan", categorical_cols=categorical_cols)

Total number of examples:  1200

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, mean_squared_error, roc_curve, auc

from tensorflow.keras.layers import Input, Dense, Flatten, \
    Concatenate, concatenate, Dropout, Lambda, Embedding
from tensorflow.keras.models import Model, Sequential

def build_model(X):
    """Build and compile a Keras binary classifier with one input per column of ``X``.

    Every column gets its own single-value ``Input``. A column whose dtype is
    ``int8`` is passed through a 1-dimensional ``Embedding`` (sized from the
    column's maximum value) and flattened; any other column is used as-is.
    The per-column tensors are concatenated and fed to a 100-unit ReLU layer
    with 0.5 dropout, followed by a single sigmoid output unit.

    Args:
        X: DataFrame whose columns and dtypes define the model inputs.
            Only the dtypes and the per-column maxima are read; no training
            happens here.

    Returns:
        The compiled, untrained ``Model``. Its inputs are in the same order
        as the columns of ``X``.
    """
    input_els = []
    encoded_els = []
    for k, dtype in zip(X.dtypes.index, map(str, X.dtypes)):
        input_els.append(Input(shape=(1,)))
        if dtype == "int8":
            # Convert to a Python int before adding 1: the maximum is an int8
            # scalar, and 127 + 1 evaluated in int8 overflows under NumPy 2
            # promotion rules, which would give a negative vocabulary size.
            vocab_size = int(X[k].max()) + 1
            e = Flatten()(Embedding(vocab_size, 1)(input_els[-1]))
        else:
            e = input_els[-1]
        encoded_els.append(e)
    encoded_els = concatenate(encoded_els)

    layer1 = Dropout(0.5)(Dense(100, activation="relu")(encoded_els))
    out = Dense(1, activation='sigmoid')(layer1)

    # Assemble and compile the model (training is done by the caller).
    model = Model(inputs=input_els, outputs=[out])
    model.compile(optimizer="adam", loss='binary_crossentropy', metrics=['accuracy'])
    return model


def f_in(X, m=None):
    """Split a DataFrame into a list of per-column Series.

    Args:
        X: DataFrame to split. One Series is produced per column, in column
            order.
        m: Optional row limit. ``None`` (the default) keeps every row; any
            other value keeps the rows selected by ``X.iloc[:m]``, so ``0``
            yields empty columns rather than the whole frame.

    Returns:
        A list with ``X.shape[1]`` Series, one per column of ``X``.
    """
    # Test against None explicitly: a truthiness check would make m=0 behave
    # like "no limit" and silently return every row.
    row_slice = slice(None) if m is None else slice(m)
    return [X.iloc[row_slice, i] for i in range(X.shape[1])]

def f_out(probs, threshold=0.5):
    """Turn predicted probabilities into hard 0/1 class labels.

    Args:
        probs: 2-D array of probabilities; only its first column is used.
        threshold: Values greater than or equal to this become 1, all
            others become 0.

    Returns:
        A list with one integer label per row of ``probs``.
    """
    is_positive = probs >= threshold
    labels = is_positive.astype(int)
    # Row 0 of the transpose is the first column of the label matrix.
    first_column = labels.T[0]
    return list(first_column)

# Build the network from the training features' columns and dtypes.
model = build_model(x_train)

# Train on the per-column inputs produced by f_in.
model.fit(f_in(x_train), y_train, epochs=50, batch_size=512)

Epoch 1/50
99/99 [==============================] - 1s 3ms/step - loss: 0.6227 - accuracy: 0.6459
Epoch 2/50
99/99 [==============================] - 0s 3ms/step - loss: 0.4600 - accuracy: 0.7812
Epoch 3/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3968 - accuracy: 0.8153
Epoch 4/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3789 - accuracy: 0.8215
Epoch 5/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3751 - accuracy: 0.8237
Epoch 6/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3771 - accuracy: 0.8235
Epoch 7/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3730 - accuracy: 0.8254
Epoch 8/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3675 - accuracy: 0.8312
Epoch 9/50
99/99 [==============================] - 0s 2ms/step - loss: 0.3685 - accuracy: 0.8281
Epoch 10/50
99/99 [==============================] - 0s 2ms/step - loss: 0.3620 - accuracy: 0.8313
Epoch 11/50
99/99 [==============================] - 0s 2ms/step - loss: 0.3687 - accuracy: 0.8297
Epoch 12/50
99/99 [==============================] - 0s 2ms/step - loss: 0.3698 - accuracy: 0.8292
Epoch 13/50
99/99 [==============================] - 0s 2ms/step - loss: 0.3666 - accuracy: 0.8285
Epoch 14/50
99/99 [==============================] - 0s 2ms/step - loss: 0.3649 - accuracy: 0.8305
Epoch 15/50
99/99 [==============================] - 0s 2ms/step - loss: 0.3628 - accuracy: 0.8326
Epoch 16/50
99/99 [==============================] - 0s 2ms/step - loss: 0.3669 - accuracy: 0.8306
Epoch 17/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3587 - accuracy: 0.8347
Epoch 18/50
99/99 [==============================] - 0s 2ms/step - loss: 0.3639 - accuracy: 0.8306
Epoch 19/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3618 - accuracy: 0.8335
Epoch 20/50
99/99 [==============================] - 0s 2ms/step - loss: 0.3628 - accuracy: 0.8315
Epoch 21/50
99/99 [==============================] - 0s 2ms/step - loss: 0.3641 - accuracy: 0.8325
Epoch 22/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3634 - accuracy: 0.8310
Epoch 23/50
99/99 [==============================] - 0s 2ms/step - loss: 0.3626 - accuracy: 0.8293
Epoch 24/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3659 - accuracy: 0.8298
Epoch 25/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3607 - accuracy: 0.8333
Epoch 26/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3600 - accuracy: 0.8321
Epoch 27/50
99/99 [==============================] - 0s 2ms/step - loss: 0.3650 - accuracy: 0.8296
Epoch 28/50
99/99 [==============================] - 0s 2ms/step - loss: 0.3626 - accuracy: 0.8317
Epoch 29/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3654 - accuracy: 0.8310
Epoch 30/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3659 - accuracy: 0.8322
Epoch 31/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3716 - accuracy: 0.8278
Epoch 32/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3631 - accuracy: 0.8326
Epoch 33/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3669 - accuracy: 0.8312
Epoch 34/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3604 - accuracy: 0.8325
Epoch 35/50
99/99 [==============================] - 0s 2ms/step - loss: 0.3625 - accuracy: 0.8318
Epoch 36/50
99/99 [==============================] - 0s 2ms/step - loss: 0.3605 - accuracy: 0.8326
Epoch 37/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3595 - accuracy: 0.8334
Epoch 38/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3653 - accuracy: 0.8316
Epoch 39/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3591 - accuracy: 0.8350
Epoch 40/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3602 - accuracy: 0.8337
Epoch 41/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3617 - accuracy: 0.8316
Epoch 42/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3624 - accuracy: 0.8320
Epoch 43/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3624 - accuracy: 0.8328
Epoch 44/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3602 - accuracy: 0.8326
Epoch 45/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3610 - accuracy: 0.8337
Epoch 46/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3626 - accuracy: 0.8323
Epoch 47/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3634 - accuracy: 0.8326
Epoch 48/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3614 - accuracy: 0.8328
Epoch 49/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3610 - accuracy: 0.8332
Epoch 50/50
99/99 [==============================] - 0s 3ms/step - loss: 0.3590 - accuracy: 0.8332





<tensorflow.python.keras.callbacks.History at 0x7f1ee46da710>

# Evaluate on the held-out split. The model is compiled with a loss plus the
# 'accuracy' metric, so score[0] is the loss and score[1] the accuracy.
score = model.evaluate(f_in(x_test), y_test, verbose=1)
print("Error %.4f: " % score[0])
# Accuracy is scaled to a percentage for display.
print("Accuracy %.4f: " % (score[1]*100))

38/38 [==============================] - 0s 1ms/step - loss: 0.3630 - accuracy: 0.8292
Error 0.3630: 
Accuracy 82.9167:

# Raw sigmoid outputs for the test split, and their 0/1 labels at the
# default 0.5 threshold of f_out.
probabilities = model.predict(f_in(x_test))
pred = f_out(probabilities)

# Metrics over the whole test split (no per-group breakdown).
_= xai.metrics_plot(
        y_test, 
        probabilities)

# Show the first rows of the original frame (notebook display only).
df.head()

<style>
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	age	workclass	education	education-num	marital-status	occupation	relationship	ethnicity	gender	capital-gain	hours-per-week	loan
0	39	State-gov	Bachelors	13	Never-married	Adm-clerical	Not-in-family	White	Male	2174	40	<=50K
1	50	Self-emp-not-inc	Bachelors	13	Married-civ-spouse	Exec-managerial	Husband	White	Male	0	13	<=50K
2	38	Private	HS-grad	9	Divorced	Handlers-cleaners	Not-in-family	White	Male	0	40	<=50K
3	53	Private	11th	7	Married-civ-spouse	Handlers-cleaners	Husband	Black	Male	0	40	<=50K
4	28	Private	Bachelors	13	Married-civ-spouse	Prof-specialty	Wife	Black	Female	0	40	<=50K

# Metrics broken down by gender and ethnicity together, using the
# human-readable test rows for grouping.
_ = xai.metrics_plot(
    y_test, 
    probabilities, 
    df=x_test_display, 
    cross_cols=["gender", "ethnicity"],
    categorical_cols=categorical_cols)

# The same metrics, broken down by each protected column on its own.
_ = [xai.metrics_plot(
    y_test, 
    probabilities, 
    df=x_test_display, 
    cross_cols=[p],
    categorical_cols=categorical_cols) for p in protected]

# Confusion matrix of the thresholded predictions, with default arguments ...
xai.confusion_matrix_plot(y_test, pred)

# ... and again with scaled=False.
xai.confusion_matrix_plot(y_test, pred, scaled=False)

# ROC plot for the whole test split, then one per protected column.
_ = xai.roc_plot(y_test, probabilities)

_ = [xai.roc_plot(
    y_test, 
    probabilities, 
    df=x_test_display, 
    cross_cols=[p],
    categorical_cols=categorical_cols) for p in protected]

# PR plot for the whole test split, then one per protected column.
_= xai.pr_plot(y_test, probabilities)

_ = [xai.pr_plot(
    y_test, 
    probabilities, 
    df=x_test_display, 
    cross_cols=[p],
    categorical_cols=categorical_cols) for p in protected]

# Smile-imbalance breakdown with default settings; the result is indexed by
# column name in the following cell.
d = xai.smile_imbalance(
    y_test, 
    probabilities)

/home/alejandro/miniconda3/lib/python3.7/site-packages/pandas/core/indexing.py:671: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
WARNING:root:No categorical_cols passed so inferred using np.object, np.int8 and np.bool: Index(['target', 'manual-review'], dtype='object'). If you see an error these are not correct, please provide them as a string array as: categorical_cols=['col1', 'col2', ...]

# Bar chart of the column totals for "correct" and "incorrect".
d[["correct", "incorrect"]].sum().plot.bar()

<matplotlib.axes._subplots.AxesSubplot at 0x7f1effc41fd0>

# Recompute with threshold=0.75 and display_breakdown=True.
# NOTE(review): threshold is presumably the probability cut-off for the
# positive class - confirm against xai.smile_imbalance.
d = xai.smile_imbalance(
    y_test, 
    probabilities,
    threshold=0.75,
    display_breakdown=True)

WARNING:root:No categorical_cols passed so inferred using np.object, np.int8 and np.bool: Index(['target', 'manual-review'], dtype='object'). If you see an error these are not correct, please provide them as a string array as: categorical_cols=['col1', 'col2', ...]

# Bar chart of the column totals for the four confusion-matrix outcomes.
display_bars = ["true-positives", "true-negatives", 
                "false-positives", "false-negatives"]
d[display_bars].sum().plot.bar()

<matplotlib.axes._subplots.AxesSubplot at 0x7f1effecd990>

# Recompute with 9 bins and a manual_review setting; the result gains a
# "manual-review" column that is plotted in the next cell.
# NOTE(review): the unit/meaning of manual_review=0.00001 is defined by
# xai.smile_imbalance - confirm.
d = xai.smile_imbalance(
    y_test, 
    probabilities,
    bins=9,
    threshold=0.75,
    manual_review=0.00001,
    display_breakdown=False)

WARNING:root:No categorical_cols passed so inferred using np.object, np.int8 and np.bool: Index(['target', 'manual-review'], dtype='object'). If you see an error these are not correct, please provide them as a string array as: categorical_cols=['col1', 'col2', ...]

# Bar chart of the column totals for "correct", "incorrect" and "manual-review".
d[["correct", "incorrect", "manual-review"]].sum().plot.bar()

<matplotlib.axes._subplots.AxesSubplot at 0x7f1ec01c9850>

def get_avg(x, y):
    """Score the global ``model`` on ``(x, y)`` and return its accuracy.

    ``model.evaluate`` returns the loss followed by the compiled metrics; the
    model in this file is compiled with ``metrics=['accuracy']``, so index 1
    is the accuracy.

    Args:
        x: Feature DataFrame; split into per-column inputs with ``f_in``.
        y: Labels matching the rows of ``x``.

    Returns:
        The second value returned by ``model.evaluate``.
    """
    evaluation = model.evaluate(f_in(x), y, verbose=0)
    return evaluation[1]

# Feature importance on the test split, scored with get_avg (model accuracy).
# NOTE(review): how the importance is derived from the scores is defined by
# xai.feature_importance - confirm.
imp = xai.feature_importance(x_test, y_test, get_avg)

# Show the resulting per-feature values (notebook display only).
imp.head()

<style>
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	age	workclass	education	education-num	marital-status	occupation	relationship	ethnicity	gender	capital-gain	capital-loss	hours-per-week
0	0.01825	0.002167	0.000833	0.046	0.065667	0.019083	0.02425	0.00275	0.000833	0.05075	0.007833	0.014417

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

examples

examples

README.md

Files

examples

Directory actions

More options

Directory actions

More options

Latest commit

History

examples

Folders and files

parent directory

README.md