Custom models and datasets

In the following sections we provide examples of using LUX with custom models and datasets.

This example shows how to handle categorical variables in LUX and how to plug in a PyTorch model with a custom input transformation. The full working example can be found here: Google Colab example

You can add categorical variables to LUX by passing the categorical parameter to the fit function. The categorical parameter should be a list of boolean values, with True at the positions of the features that are to be treated as categorical.

from lux.lux import LUX
# NOTE: the instance to explain is drawn at random, so the results will differ
# from run to run depending on which sample is picked.
explain_instance = X_train.sample(1).values

# Build a boolean mask over the feature columns: True where the column is
# categorical (object dtype in X_train), False otherwise.
features = df.columns[:-1]
categorical_columns = X_train.select_dtypes(include=['object']).columns
categorical_indicator = [feature in categorical_columns for feature in features]

# Train LUX with a neighbourhood size of 50 around the explained instance
lux = LUX(
    predict_proba=model.predict_proba,
    neighborhood_size=50,
    max_depth=2,
    node_size_limit=1,
    grow_confidence_threshold=0,
)
lux.fit(
    X_train,
    y_train,
    instance_to_explain=explain_instance,
    # boolean mask marking the categorical features
    categorical=categorical_indicator,
)

Below is an example of how to use a custom, more complex model (e.g. a deep neural network). The only requirement is that the custom model implements the predict and predict_proba functions.

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from lux.lux import LUX
from sklearn import datasets
from sklearn import svm
from sklearn.model_selection import train_test_split


class SimpleNN(nn.Module):
    """Small feed-forward binary classifier with a scikit-learn-like API.

    The network is ``input_size -> 128 -> 64 -> 1`` with ReLU activations and
    a final sigmoid, so ``forward`` returns one value in (0, 1) per row, which
    ``predict_proba`` reports as the probability of class 1.

    Args:
        input_size: Number of input features (columns of the design matrix).
    """

    def __init__(self, input_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        self.sigmoid = nn.Sigmoid()

    @staticmethod
    def _to_tensor(X):
        """Convert a sparse matrix or dense array-like into a float32 tensor."""
        # Sparse matrices (e.g. the output of OneHotEncoder) expose toarray()
        if hasattr(X, "toarray"):
            X = X.toarray()
        return torch.tensor(np.asarray(X, dtype=np.float32))

    def forward(self, x):
        """Run the forward pass and return the sigmoid output, shape (n, 1)."""
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        return self.sigmoid(self.fc3(x))

    def fit(self, X, y, epochs=100, lr=0.001):
        """Train the network with full-batch Adam and binary cross-entropy.

        Args:
            X: Training features; sparse matrix, ndarray or DataFrame.
            y: Binary targets (0/1); Series, list or ndarray.
            epochs: Number of full-batch optimisation steps.
            lr: Learning rate passed to Adam.
        """
        # Accept dense as well as sparse inputs, and labels that are not
        # pandas objects (the previous X.todense() / y.values required both).
        X_train_tensor = self._to_tensor(X)
        y_train_tensor = torch.tensor(np.asarray(y, dtype=np.float32)).reshape(-1, 1)

        # Loss and optimizer
        criterion = nn.BCELoss()
        optimizer = optim.Adam(self.parameters(), lr=lr)

        # Training loop (the whole training set is used as a single batch)
        for epoch in range(epochs):
            optimizer.zero_grad()
            outputs = self(X_train_tensor)
            loss = criterion(outputs, y_train_tensor)
            loss.backward()
            optimizer.step()

            if (epoch + 1) % 10 == 0:
                print(f'Epoch [{epoch + 1}/{epochs}], Loss: {loss.item():.4f}')

    def predict_proba(self, X):
        """Return class probabilities as an ``(n, 2)`` array: ``[P(0), P(1)]``."""
        with torch.no_grad():
            outputs = self(self._to_tensor(X))

        probabilities = outputs.numpy()  # shape (n, 1): probability of class 1
        return np.column_stack([1 - probabilities, probabilities])

    def predict(self, X):
        """Return hard 0/1 predictions (threshold 0.5) as an ``(n, 1)`` float array."""
        with torch.no_grad():
            outputs = self(self._to_tensor(X))

        # Classify based on the output probability (threshold of 0.5)
        predictions = (outputs >= 0.5).float()
        return predictions.numpy()

Once done, the custom model can be wrapped together with a custom data transformer, so that the whole classifier becomes a single black box to LUX:

class CategoricalWrapper:
    """Bundle a preprocessing ColumnTransformer and a model into one estimator.

    Categorical columns are one-hot encoded, the remaining columns are passed
    through (or standardised when ``normalize=True``), and the transformed
    matrix is fed to the model built by ``model_creator``. The wrapper exposes
    ``fit``, ``predict``, ``predict_proba`` and ``score`` on the raw features.

    Args:
        model_creator: Callable returning an unfitted model; it is called with
            the keyword arguments resolved from ``model_params``.
        model_params: ``None`` (no arguments), a dict of keyword arguments, or
            the string ``'input_size'`` to pass the number of transformed
            columns as ``input_size``.
        ohe_encoder: Encoder for the categorical columns; a fresh
            ``OneHotEncoder(categories=categories)`` is used when ``None``.
        categorical_indicator: Sequence of booleans, one per feature, True for
            categorical features. Required.
        features: Column names matching ``categorical_indicator``. When
            ``None``, columns are addressed by position.
        categories: Forwarded to ``OneHotEncoder`` when no encoder is given.
        normalize: When True, apply ``StandardScaler`` to the non-categorical
            columns.

    Raises:
        ValueError: If ``categorical_indicator`` is not provided.
    """

    def __init__(self, model_creator, model_params=None, ohe_encoder=None, categorical_indicator=None, features=None, categories='auto', normalize=False):
        from sklearn.compose import ColumnTransformer
        from sklearn.preprocessing import OneHotEncoder, StandardScaler

        if categorical_indicator is None:
            raise ValueError(
                "categorical_indicator is required: pass one boolean per feature"
            )

        # OneHotEncoder for categorical columns
        self.ohe_encoder = OneHotEncoder(categories=categories) if ohe_encoder is None else ohe_encoder

        # Store parameters
        self.features = features
        self.categories = categories
        self.categorical_indicator = categorical_indicator
        self.model_params = model_params
        self.model_creator = model_creator

        # Without feature names, address the columns by position
        # (ColumnTransformer accepts integer column specifiers).
        columns = features if features is not None else range(len(categorical_indicator))
        categorical_columns = [f for f, c in zip(columns, categorical_indicator) if c]
        transformers = [("categorical", self.ohe_encoder, categorical_columns)]

        # If normalize is True, add StandardScaler for non-categorical columns
        if normalize:
            non_categorical_columns = [f for f, c in zip(columns, categorical_indicator) if not c]
            transformers.append(("scaler", StandardScaler(), non_categorical_columns))

        # Columns not claimed by any transformer are passed through unchanged
        self.ct = ColumnTransformer(transformers, remainder='passthrough')

    def _resolve_model_params(self, X_tr):
        """Turn ``self.model_params`` into keyword arguments for ``model_creator``."""
        if self.model_params is None:
            return {}
        if isinstance(self.model_params, dict):
            return dict(self.model_params)
        if self.model_params == 'input_size':
            # The model needs to know the width of the transformed matrix
            return {'input_size': X_tr.shape[1]}
        raise ValueError(
            f"Unsupported model_params: {self.model_params!r}; "
            "expected None, a dict of keyword arguments or 'input_size'"
        )

    def _transform(self, X):
        """Apply the fitted ColumnTransformer, restoring column names if needed."""
        # Plain arrays carry no column names; rebuild the DataFrame so that
        # name-based column selection keeps working.
        if isinstance(X, np.ndarray) and self.features is not None:
            X = pd.DataFrame(X, columns=self.features)
        return self.ct.transform(X)

    def fit(self, X, y):
        """Fit the transformer on ``X``, then create and fit the model.

        Returns:
            The fitted wrapper (``self``).

        Raises:
            ValueError: If ``model_params`` has an unsupported value.
        """
        X_tr = self.ct.fit_transform(X)

        # Create the model by passing parameters to the model_creator callable
        self.model = self.model_creator(**self._resolve_model_params(X_tr))

        self.model.fit(X_tr, y)
        return self

    def predict(self, X):
        """Return the model's predictions for raw (untransformed) ``X``."""
        return self.model.predict(self._transform(X))

    def predict_proba(self, X):
        """Return class probabilities for raw (untransformed) ``X``.

        Uses the model's ``predict_proba`` when available; otherwise a sigmoid
        of ``decision_function`` (binary case); otherwise a 0/1 indicator
        matrix built from ``predict`` and ``classes_``.
        """
        X_tr = self._transform(X)
        if hasattr(self.model, 'predict_proba'):
            return self.model.predict_proba(X_tr)
        if hasattr(self.model, 'decision_function'):
            # Sigmoid transformation for decision_function output
            decision_scores = self.model.decision_function(X_tr)
            probabilities = 1 / (1 + np.exp(-decision_scores))
            return np.column_stack([1 - probabilities, probabilities])
        # Last resort: one column per class, 1.0 where it is the predicted class.
        # predict() is evaluated once rather than once per class.
        predictions = np.ravel(self.model.predict(X_tr))
        return np.column_stack([predictions == c for c in self.model.classes_]).astype(float)

    def score(self, X, y):
        """Return the model's score on raw ``X``; plain accuracy if it has no ``score``."""
        X_tr = self._transform(X)
        if hasattr(self.model, 'score'):
            return self.model.score(X_tr, y)
        predictions = np.ravel(self.model.predict(X_tr))
        return float(np.mean(predictions == np.ravel(y)))

Finally, the whole pipeline (data loading, preprocessing, model training) can be run in a unified way:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
import xgboost as xgb


# Define the URL of the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

# Define the column names (based on the dataset documentation)
column_names = [
    "age", "workclass", "fnlwgt", "education", "education_num", "marital_status",
    "occupation", "relationship", "race", "sex", "capital_gain", "capital_loss",
    "hours_per_week", "native_country", "income"
]

# Download the dataset into a Pandas DataFrame.
# skipinitialspace=True strips the blank that follows each comma, so missing
# entries are parsed as "?" (not " ?"); na_values must match the stripped form
# or the missing values are never detected.
df = pd.read_csv(url, header=None, names=column_names, na_values="?", skipinitialspace=True)

# Display basic information about the dataset
print("Dataset Shape:", df.shape)
print("\nSample Data:")
print(df.head())

# Check for missing values
print("\nMissing Values:")
print(df.isnull().sum())

# LabelEncoder is not part of the imports at the top of this snippet
from sklearn.preprocessing import LabelEncoder

# Preprocess the dataset (e.g., encoding categorical variables, handling missing values)
df = df.dropna()

# Categorical features are the object-dtype columns, excluding the target
categorical_columns = df.select_dtypes(include=['object']).columns
categorical_columns = categorical_columns[categorical_columns != 'income']

# All columns except the last one (the target) are features; the indicator
# holds one boolean per feature, True for categorical ones
features = df.columns[:-1]
categorical_indicator = [col in categorical_columns for col in features]

# Replace string categories with integer codes; the encoder is re-fitted for
# every column, so after this section it holds the mapping of 'income' only
le = LabelEncoder()
for col in categorical_columns:
    df[col] = le.fit_transform(df[col])
df['income'] = le.fit_transform(df['income'])

# Split the data into features and target
target = 'income'
X = df.drop(columns=[target])
y = df[target]

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Choose and train a model
model_type = 'deep_learning'

# Factories for the models that are trained through CategoricalWrapper.
# Each entry is called with the keyword arguments the wrapper resolves
# (none, or input_size for the neural network).
model_creators = {
    'random_forest': RandomForestClassifier,
    'svm': lambda: SVC(probability=True),
    'logistic_regression': LogisticRegression,
    'mlp': MLPClassifier,
    'deep_learning': SimpleNN,  # created as SimpleNN(input_size=...)
}

if model_type == 'xgb':
    # Use XGBoost directly, with categorical support enabled
    model = xgb.XGBClassifier(enable_categorical=True)
    model.fit(X_train, y_train)
elif model_type == 'deep_learning':
    # The PyTorch network needs the width of the transformed input and
    # standardised numeric features, so it gets extra wrapper options
    model = CategoricalWrapper(
        model_creator=model_creators[model_type],
        model_params='input_size',
        features=X_train.columns,
        normalize=True,
        categorical_indicator=categorical_indicator
    )
    model.fit(X_train, y_train)
elif model_type in model_creators:
    # All scikit-learn models share the same wrapper configuration
    model = CategoricalWrapper(model_creators[model_type], categorical_indicator=categorical_indicator, features=features)
    model.fit(X_train, y_train)
else:
    # Fail here instead of hitting a NameError on `model` further down
    raise ValueError(
        f"Invalid model type selected: {model_type!r}. "
        f"Choose 'xgb' or one of {sorted(model_creators)}."
    )

# If not deep learning, evaluate the model
if model_type != 'deep_learning':
    accuracy = model.score(X_test, y_test)
    print(f"\nModel Accuracy: {accuracy:.2f}")