Building Decision Trees and Random Forests from scratch

Decision trees are a fundamental algorithm in machine learning. Random forests are a method of improving the performance of decision trees by training an ensemble of them, i.e. training multiple decision trees and aggregating their results.

Our decision trees and random forests will be trained on the BMI dataset from here, which I've downloaded and renamed to bmi-data.csv. We'll open it in pandas:

In [1]:
import pandas as pd

data = pd.read_csv("bmi-data.csv")

data
Out[1]:
Gender Height Weight Index
0 Male 174 96 4
1 Male 189 87 2
2 Female 185 110 4
3 Female 195 104 3
4 Male 149 61 3
... ... ... ... ...
495 Female 150 153 5
496 Female 184 121 4
497 Female 141 136 5
498 Male 150 95 5
499 Male 173 131 5

500 rows × 4 columns

Our dataset has the gender, height, weight and BMI index for 500 people. We'll use the gender, height and weight to predict the BMI index. To make things slightly easier, instead of predicting the BMI index directly, we'll predict whether or not the person is obese, which is when the Index column is >= 4.

In [2]:
data["Obese"] = (data["Index"] >= 4).astype("int")
data = data.drop("Index", axis=1)

data
Out[2]:
Gender Height Weight Obese
0 Male 174 96 1
1 Male 189 87 0
2 Female 185 110 1
3 Female 195 104 0
4 Male 149 61 0
... ... ... ... ...
495 Female 150 153 1
496 Female 184 121 1
497 Female 141 136 1
498 Male 150 95 1
499 Male 173 131 1

500 rows × 4 columns

How do we predict if someone is obese using a decision tree? A decision tree consists of a series of binary rules (each has a yes/no outcome), e.g. if weight >= 100 kg, then the tree predicts the person is obese. The key concept is that the tree is made of multiple stacked rules, and each rule is used to split the data. At each split, some data is routed in one direction and some in the other. The data is continuously split until it reaches the point where it cannot be split anymore, which is where our decision tree makes a prediction. Each split in the tree is called a node, and the final nodes (where no more splits are made) are called leaf nodes.
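
To make the idea of stacked rules concrete, here is a hand-written sketch of the kind of rule stack a small tree might represent (the 100 kg and 170 cm thresholds are made up for illustration, not learned from the data):

def toy_tree_predict(height, weight):
    # Root node: split on weight.
    if weight >= 100:
        return 1  # leaf node: predict obese
    else:
        # Second node: split on height.
        if height < 170:
            return 1  # leaf node: predict obese
        else:
            return 0  # leaf node: predict not obese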

The decision tree starts with no nodes; these are learned when the decision tree is trained (or fit) on the training data. Once the tree is trained, we can use it to make predictions on new data. The algorithm for training a decision tree uses a cost function to determine how to perform a split at each node. The cost function uses impurity, which measures how likely the target value is to be incorrectly classified. The two most common cost functions are the Gini index and entropy; here we'll just focus on entropy.
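
If we wanted to use the Gini index instead, a minimal sketch mirroring the shape of the entropy function we define below might look like this:

def gini_impurity(x):
    assert isinstance(x, pd.Series)
    p = x.value_counts() / len(x)
    # Gini impurity: 1 minus the sum of squared class probabilities.
    # It is 0 when every value belongs to one class, and 0.5 for an even two-class split.
    return 1 - (p ** 2).sum()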

Let's compare what happens if we make a single split in the data, predicting obesity at weight >= 100 kg versus at weight >= 80 kg. We can count the number of mistakes made by each split:

In [3]:
df_100kg = data[(data["Weight"] >= 100) & (data["Obese"] == 0)]
df_80kg = data[(data["Weight"] >= 80) & (data["Obese"] == 0)]

len(df_100kg), len(df_80kg)
Out[3]:
(18, 63)

We can see that splitting at 100 kg made 18 mistakes, and splitting at 80 kg made 63 mistakes. Therefore the split at 100 kg is less impure and thus is a better split. Cost functions are used to find splits that minimize impurity.

Entropy can be used as a measurement of impurity and is given by: $$H(X) = -\sum_{i=1}^{n} p(x_i) \log_2 p(x_i)$$ where $p(x_i)$ is the probability of class $x_i$ occurring. The entropy is 0 when all the data is of the same class (minimum impurity), and is 1 when the data is evenly split between two classes (maximum impurity for binary classification).

In [4]:
import numpy as np


def entropy(x):
    assert isinstance(x, pd.Series)
    p = x.value_counts() / len(x)
    # 1e-10 stops np.log2(0) returning -inf (and raising a divide-by-zero warning)
    entropy = -np.sum(p * np.log2(p + 1e-10))
    return entropy
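
As a quick sanity check of those extremes, a series containing a single class should give an entropy of (almost exactly) 0, and an even two-class series should give (almost exactly) 1; the tiny offsets come from the 1e-10 term:

entropy(pd.Series([1, 1, 1, 1]))  # ~0.0
entropy(pd.Series([0, 0, 1, 1]))  # ~1.0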

We can now compare the two splits using entropy, by calculating the entropy of the boolean "is this a mistake" series for each split. We should see that the split at 100 kg has a lower entropy than the split at 80 kg (i.e. it makes fewer mistakes, is less impure, and is a better split).

In [5]:
entropy((data["Weight"] >= 100) & (data["Obese"] == 0))
Out[5]:
0.22364166419594184
In [6]:
entropy((data["Weight"] >= 80) & (data["Obese"] == 0))
Out[6]:
0.5463652176690357

We know how to calculate impurity to tell us if a split is good or not, but what we'd like to do is calculate how much improvement in purity a split gives us compared to the data before the split. For this we use information gain. Information gain quantifies how much impurity in the data is reduced after a split. It is calculated as the difference in impurity before and after the split and uses entropy to measure this.
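
Concretely, if a split sends $n_{\text{true}}$ of the $n$ rows down the "true" branch and $n_{\text{false}}$ down the "false" branch, the information gain is: $$IG = H(\text{parent}) - \left(\frac{n_{\text{true}}}{n} H(\text{true split}) + \frac{n_{\text{false}}}{n} H(\text{false split})\right)$$ which is exactly what the function below computes.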

In [7]:
def get_information_gain(series, mask):
    n_true_split = sum(mask)
    n_false_split = len(mask) - n_true_split
    if n_true_split == 0 or n_false_split == 0:
        return 0
    original_impurity = entropy(series)
    true_split_impurity = entropy(series[mask])
    false_split_impurity = entropy(series[~mask])
    weighted_average_impurity = (
        n_true_split / len(mask) * true_split_impurity
        + n_false_split / len(mask) * false_split_impurity
    )
    information_gain = original_impurity - weighted_average_impurity
    return information_gain

We calculate the impurity of the data before the split (original_impurity) and the impurities of the two subsets created by the split (true_split_impurity and false_split_impurity). We then combine the subset impurities, weighted by the proportion of data in each subset, to get the entropy after the split (weighted_average_impurity). The information gain is then the difference between the original entropy and this weighted average entropy.

We can see that just as splitting at >= 100 kg gave us a lower impurity, it also gave us a higher information gain.

In [8]:
get_information_gain(data["Obese"], data["Weight"] >= 100)
Out[8]:
0.35468741526665915
In [9]:
get_information_gain(data["Obese"], data["Weight"] >= 80)
Out[9]:
0.3062635617880082

Now that we know how to measure how good a split is, we can use it to build a decision tree. The steps we take at each node are:

  1. Calculate the information gain for all variables
  2. Choose the split that generates the highest information gain
  3. Repeat until we hit some stopping criteria

When splitting numeric variables, e.g. weight, we get all values that the variable takes within the current subset of data, calculate the information gain of splitting on each value (routing rows less than that value one way and the rest the other), and then pick the value that gives the highest information gain.

When splitting categorical variables, e.g. gender, we get all combinations of the values that the variable takes within the current subset of data and calculate the information gain for each combination. Note: if a column has a large number of categories the number of combinations explodes, so libraries that implement decision trees/random forests usually have a better way of handling this.
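
To see how quickly this grows: a column with $k$ distinct categories produces $\sum_{j=1}^{k} \binom{k}{j} = 2^k - 1$ candidate subsets, so 3 categories give 7 options (as in the example below), while 20 categories would give over a million.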

We'll implement a function to get all potential combinations for a categorical variable:

In [10]:
import itertools


def get_categorical_options(series):
    assert isinstance(series, pd.Series)
    series = set(series)
    options = []
    for i, _ in enumerate(series):
        subset = itertools.combinations(series, i + 1)
        options.extend(subset)
    return options
In [11]:
get_categorical_options(pd.Series(["Red", "Red", "Blue", "Blue", "Green"]))
Out[11]:
[('Green',),
 ('Red',),
 ('Blue',),
 ('Green', 'Red'),
 ('Green', 'Blue'),
 ('Red', 'Blue'),
 ('Green', 'Red', 'Blue')]

Putting all this together, we can now define a function which, when given a feature column of a DataFrame and the label column, will calculate the information gain for all possible splits of that feature and return the best split.

In [12]:
def get_max_information_gain(x_series, y_series):
    is_numeric = x_series.dtype != "object"
    if is_numeric:
        # Skip the first value as it is the minimum and if we split on it one of the splits will be empty.
        split_values = x_series.sort_values().unique()[1:].tolist()
    else:
        split_values = get_categorical_options(x_series)
    results = []
    if not split_values:
        # Handle the case where there is nothing to split on, e.g. the column contains only a single value.
        return {"split_value": None, "information_gain": 0, "is_numeric": is_numeric}
    for split_value in split_values:
        mask = x_series < split_value if is_numeric else x_series.isin(split_value)
        split_information_gain = get_information_gain(y_series, mask)
        results.append(
            {
                "value": split_value,
                "information_gain": split_information_gain,
            }
        )
    results = sorted(results, key=lambda x: x["information_gain"], reverse=True)
    best_split_value = results[0]["value"]
    best_information_gain = results[0]["information_gain"]
    return {
        "split_value": best_split_value,
        "information_gain": best_information_gain,
        "is_numeric": is_numeric,
    }

We can use this to get the best split for a given feature:

In [13]:
get_max_information_gain(data["Height"], data["Obese"])
Out[13]:
{'split_value': 174,
 'information_gain': 0.06474831770089884,
 'is_numeric': True}

To verify, we can plot the information gain for each split value:

In [14]:
import matplotlib.pyplot as plt

values = data["Height"].sort_values().unique()[1:]
information_gains = [
    get_information_gain(data["Height"], data["Height"] < value) for value in values
]

fig, ax = plt.subplots()
ax.plot(values, information_gains)
ax.set_xlabel("Height")
ax.set_ylabel("Information Gain")
Out[14]:
Text(0, 0.5, 'Information Gain')
[Line plot of information gain for each height split value, peaking at a height of 174]
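
The same function also handles categorical columns; for example, we could evaluate splitting on gender. Since the trained tree later in this post never ends up splitting on Gender, we'd expect its best information gain to be comparatively low:

get_max_information_gain(data["Gender"], data["Obese"])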

Now that we can calculate the highest information gain for a single feature, we'll write a function that iterates across all feature columns, calculates the best split for each, and returns the best split overall:

In [15]:
def get_best_split(df, y):
    column_max_information_gains = [
        {"column_name": column, **get_max_information_gain(df[column], df[y])}
        for column in df.columns
        if column != y  # We don't want to split on the target variable
    ]
    best_split_info = sorted(
        column_max_information_gains, key=lambda x: x["information_gain"]
    )[-1]
    return best_split_info

We can use this to find out that the best split for the initial data is splitting at weight >= 103 kg.

In [16]:
split_info = get_best_split(data, "Obese")

split_info
Out[16]:
{'column_name': 'Weight',
 'split_value': 103,
 'information_gain': 0.3824541370911896,
 'is_numeric': True}

The last helper function uses the information about the best split to actually split the data:

In [17]:
def make_split(df, split_info):
    column_name = split_info["column_name"]
    split_value = split_info["split_value"]
    is_numeric = split_info["is_numeric"]
    assert is_numeric == (df[column_name].dtype != "object")
    if is_numeric:
        mask = df[column_name] < split_value
    else:
        mask = df[column_name].isin(split_value)
    df_left = df[mask]
    df_right = df[~mask]
    return df_left, df_right

Splitting at weight >= 103 kg gives us two subsets of data: 229 examples in the left split, where weight < 103 kg, and 271 examples in the right split, where weight >= 103 kg.

In [18]:
left, right = make_split(data, split_info)

len(data), len(left), len(right)
Out[18]:
(500, 229, 271)

We can split each of these subsets, then split the resulting subsets, and so on, until we reach some stopping criteria, which is how we build our decision tree.

In [19]:
split_info = get_best_split(left, "Obese")

split_info
Out[19]:
{'column_name': 'Height',
 'split_value': 178,
 'information_gain': 0.28026630900174687,
 'is_numeric': True}
In [20]:
split_info = get_best_split(right, "Obese")

split_info
Out[20]:
{'column_name': 'Weight',
 'split_value': 116,
 'information_gain': 0.09289094500737183,
 'is_numeric': True}

We define a DecisionNode class which will represent the nodes in our decision tree. Each node contains information about the split (the column, the split value, and whether the column is numeric), the left and right child nodes, whether the tree is for classification or regression, and the predicted label (only set when the node is a leaf node).

In [21]:
class DecisionNode:
    def __init__(
        self,
        column,
        split_value,
        is_numeric,
        left,
        right,
        is_classification,
        prediction,
    ):
        self.column = column
        self.split_value = split_value
        self.is_numeric = is_numeric
        self.left = left
        self.right = right
        self.is_classification = is_classification
        self.prediction = prediction

    def __repr__(self, depth=0, indent="    "):
        # Print the tree structure
        prefix = depth * indent
        if self.is_leaf():
            return f"{prefix}Leaf(prediction={self.prediction})"
        description = f"{prefix}DecisionNode(column={self.column}, split_value={self.split_value}, is_numeric={self.is_numeric})\n"
        if self.left:
            description += f"{prefix}left:\n{self.left.__repr__(depth + 1, indent)}\n"
        if self.right:
            description += f"{prefix}right:\n{self.right.__repr__(depth + 1, indent)}"
        return description

    def is_leaf(self):
        # If a node has no left and right children, it is a leaf node
        return self.left is None and self.right is None
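
Before training a full tree, we can sanity-check the class by hand-assembling a tiny tree (the 100 kg threshold is arbitrary, just for illustration):

# Arguments: column, split_value, is_numeric, left, right, is_classification, prediction
leaf_not_obese = DecisionNode(None, None, None, None, None, True, 0)
leaf_obese = DecisionNode(None, None, None, None, None, True, 1)
root = DecisionNode("Weight", 100, True, leaf_not_obese, leaf_obese, True, None)

print(root)
# DecisionNode(column=Weight, split_value=100, is_numeric=True)
# left:
#     Leaf(prediction=0)
# right:
#     Leaf(prediction=1)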

Finally, we want to build our tree, which we do by recursively splitting the data until we reach some stopping criteria. When a stopping criterion is met, we create a leaf node that makes a prediction. If it's a classification problem, we predict the most common class in the subset of data, and if it's a regression problem, we predict the mean of the subset of data. Our build_tree function is wrapped in a train_decision_tree function which builds the tree from a depth of zero and returns the tree itself.

The stopping criteria we'll use are:

  • Maximum depth: the maximum number of splits allowed between the root and any leaf
  • Minimum samples: the minimum number of samples a node must contain for it to be split further
  • Minimum information gain: the minimum information gain the best split must provide; below this we stop splitting
In [22]:
def build_tree(
    df, y, depth, max_depth, min_samples_split, min_information_gain, is_classification
):

    # Check if we have reached the maximum depth
    if depth >= max_depth:
        return create_leaf_node(df, y, is_classification)

    # Check if we have reached the minimum number of samples required to split
    if len(df) < min_samples_split:
        return create_leaf_node(df, y, is_classification)

    # Get the best split
    split_info = get_best_split(df, y)

    # Check if the best split has enough information gain
    if split_info["information_gain"] < min_information_gain:
        return create_leaf_node(df, y, is_classification)

    # Make the split
    df_left, df_right = make_split(df, split_info)

    # Build the left and right subtrees
    subtree_left = build_tree(
        df_left,
        y,
        depth + 1,
        max_depth,
        min_samples_split,
        min_information_gain,
        is_classification,
    )

    subtree_right = build_tree(
        df_right,
        y,
        depth + 1,
        max_depth,
        min_samples_split,
        min_information_gain,
        is_classification,
    )

    # Make the decision node
    return DecisionNode(
        column=split_info["column_name"],
        split_value=split_info["split_value"],
        is_numeric=split_info["is_numeric"],
        left=subtree_left,
        right=subtree_right,
        is_classification=is_classification,
        prediction=None,
    )


def create_leaf_node(df, y, is_classification):
    # If it is a classification problem, we return the mode of the target variable
    if is_classification:
        prediction = df[y].mode()[0]
    # If it is a regression problem, we return the mean of the target variable
    else:
        prediction = df[y].mean()

    return DecisionNode(
        column=None,
        split_value=None,
        is_numeric=None,
        left=None,
        right=None,
        is_classification=is_classification,
        prediction=prediction,
    )


def train_decision_tree(
    df,
    y,
    max_depth,
    min_samples_split,
    min_information_gain,
    is_classification,
):
    return build_tree(
        df, y, 0, max_depth, min_samples_split, min_information_gain, is_classification
    )

Now we can specify which column is our label column, the values of our stopping criteria, and whether our tree is doing classification or regression, and then train our tree.

Note: Ideally we should be using train/validation/test splits of our data, but for simplicity here we'll just train on the entire dataset.

In [23]:
y = "Obese"
max_depth = 10
min_samples_split = 2
min_information_gain = 0.1
is_classification = True

tree = train_decision_tree(
    data,
    y,
    max_depth,
    min_samples_split,
    min_information_gain,
    is_classification,
)

We can also inspect the individual splits in our tree. The first split was on weight at 103 kg; the left branch from that (where weight < 103 kg) was then split on height at 178 cm, while the right branch (where weight >= 103 kg) was already a leaf node.

In [24]:
tree
Out[24]:
DecisionNode(column=Weight, split_value=103, is_numeric=True)
left:
    DecisionNode(column=Height, split_value=178, is_numeric=True)
    left:
        DecisionNode(column=Weight, split_value=66, is_numeric=True)
        left:
            Leaf(prediction=0)
        right:
            DecisionNode(column=Height, split_value=151, is_numeric=True)
            left:
                DecisionNode(column=Weight, split_value=67, is_numeric=True)
                left:
                    DecisionNode(column=Height, split_value=149, is_numeric=True)
                    left:
                        Leaf(prediction=1)
                    right:
                        Leaf(prediction=0)
                right:
                    Leaf(prediction=1)
            right:
                DecisionNode(column=Weight, split_value=82, is_numeric=True)
                left:
                    DecisionNode(column=Height, split_value=161, is_numeric=True)
                    left:
                        DecisionNode(column=Weight, split_value=74, is_numeric=True)
                        left:
                            Leaf(prediction=0)
                        right:
                            DecisionNode(column=Height, split_value=154, is_numeric=True)
                            left:
                                DecisionNode(column=Weight, split_value=78, is_numeric=True)
                                left:
                                    Leaf(prediction=1)
                                right:
                                    Leaf(prediction=0)
                            right:
                                Leaf(prediction=1)
                    right:
                        Leaf(prediction=0)
                right:
                    DecisionNode(column=Height, split_value=173, is_numeric=True)
                    left:
                        Leaf(prediction=1)
                    right:
                        DecisionNode(column=Weight, split_value=95, is_numeric=True)
                        left:
                            Leaf(prediction=0)
                        right:
                            Leaf(prediction=1)
    right:
        Leaf(prediction=0)
right:
    Leaf(prediction=1)

We can make predictions with our tree by recursively traversing the tree with our input data until we hit a leaf node, and then returning the prediction at that leaf node.

In [25]:
def _predict_tree(tree, example):
    if tree.is_leaf():
        return tree.prediction
    if tree.is_numeric:
        if example[tree.column] < tree.split_value:
            return _predict_tree(tree.left, example)
        else:
            return _predict_tree(tree.right, example)
    else:
        if example[tree.column] in tree.split_value:
            return _predict_tree(tree.left, example)
        else:
            return _predict_tree(tree.right, example)


def predict_tree(tree, x):
    if isinstance(x, pd.Series):
        return _predict_tree(tree, x)
    else:
        assert isinstance(x, pd.DataFrame)
        return x.apply(lambda row: _predict_tree(tree, row), axis=1)

We can make a prediction for a single example:

In [26]:
example = data.iloc[0]

example
Out[26]:
Gender    Male
Height     174
Weight      96
Obese        1
Name: 0, dtype: object
In [27]:
predict_tree(tree, example)
Out[27]:
1

We can also make predictions for all of the examples in the DataFrame:

In [28]:
predictions = predict_tree(tree, data)
In [29]:
predictions
Out[29]:
0      1
1      0
2      1
3      1
4      0
      ..
495    1
496    1
497    1
498    1
499    1
Length: 500, dtype: int64

We can see that our predictions are not 100% accurate, even though we trained our tree on the entire dataset. This is because our stopping criteria (in particular the minimum information gain of 0.1) stop the tree from growing deep enough to separate every training example perfectly.

In [30]:
(predictions == data["Obese"]).sum()
Out[30]:
487
In [31]:
errors = data[data["Obese"] != predictions]
In [32]:
errors
Out[32]:
Gender Height Weight Obese
3 Female 195 104 0
5 Male 189 104 0
36 Female 197 114 0
82 Female 190 105 0
92 Female 194 111 0
137 Male 194 108 0
146 Male 194 106 0
149 Female 168 115 0
184 Female 192 108 0
291 Male 183 105 0
296 Female 169 88 0
400 Female 195 104 0
469 Male 198 109 0

Next, we'll build a random forest. A random forest is an ensemble of decision trees. We train multiple decision trees, each using a bootstrapped version of the original dataset. In other words, each tree is trained on a dataset which is the same size (number of examples) as the original data, but the examples are sampled from the original dataset with replacement. This technique is called bagging (or bootstrap aggregating). (A full random forest implementation also considers only a random subset of the features at each split; here we'll keep things simple and just use bagging.)
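
The bootstrapping itself is just sampling with replacement. Each bootstrapped dataset has the same number of rows as the original, but only around 63% of the original rows appear in it (the rest are duplicates), which is what gives each tree a slightly different view of the data. A quick way to see this (the exact fraction will vary from run to run):

sample = data.sample(frac=1, replace=True)

len(sample)  # 500, same size as the original data
sample.index.nunique() / len(data)  # roughly 0.63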

After we've trained the trees (a forest) we can use them to make predictions. For classification problems, we use the most common prediction from all the trees, and for regression problems, we use the mean prediction from all the trees.

In [33]:
import tqdm


def train_random_forest(
    data,
    y,
    n_trees=10,
    max_depth=2,
    min_samples_split=2,
    min_information_gain=0.01,
    is_classification=True,
):
    trees = []
    for _ in tqdm.tqdm(range(n_trees)):
        # Bootstrap sample the data
        sample = data.sample(frac=1, replace=True)
        # Train a decision tree on the bootstrapped sample
        tree = train_decision_tree(
            sample,
            y,
            max_depth,
            min_samples_split,
            min_information_gain,
            is_classification,
        )
        trees.append(tree)
    return trees

We'll train our forest, which consists of 25 trees, using the same stopping criteria as before:

In [34]:
y = "Obese"
n_trees = 25
max_depth = 10
min_samples_split = 2
min_information_gain = 0.1
is_classification = True

forest = train_random_forest(
    data,
    y,
    n_trees,
    max_depth,
    min_samples_split,
    min_information_gain,
    is_classification,
)
100%|███████████████████████████████████████████| 25/25 [00:33<00:00,  1.32s/it]

Finally, we'll define functions to make our predictions:

In [35]:
def _predict_forest(forest, x):
    predictions = [predict_tree(tree, x) for tree in forest]
    is_classification = forest[0].is_classification
    if is_classification:
        return max(set(predictions), key=predictions.count)
    else:
        return np.mean(predictions)


def predict_forest(forest, x):
    if isinstance(x, pd.Series):
        return _predict_forest(forest, x)
    else:
        assert isinstance(x, pd.DataFrame)
        return x.apply(lambda row: _predict_forest(forest, row), axis=1)

We can predict on one sample:

In [36]:
predict_forest(forest, example)
Out[36]:
1

And we can also predict on the entire DataFrame, and see that our predictions are more accurate than using a single decision tree (2 errors vs 13):

In [37]:
(predict_forest(forest, data) == data["Obese"]).sum()
Out[37]:
498

As we can see, random forests are a small extension (both conceptually and in code) to decision trees, but significantly improve their performance. For using random forests in actual code, I'd recommend scikit-learn, or xgboost/catboost (the latter two implement gradient-boosted trees rather than random forests, but work in a similar way, by using an ensemble of decision trees).
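
For example, a roughly equivalent scikit-learn model might look like the sketch below, where the Gender column is one-hot encoded with pd.get_dummies since scikit-learn's trees expect numeric features (the hyperparameters mirror the ones we used above):

from sklearn.ensemble import RandomForestClassifier

# One-hot encode the categorical Gender column.
X = pd.get_dummies(data.drop("Obese", axis=1))
y = data["Obese"]

sk_forest = RandomForestClassifier(n_estimators=25, max_depth=10, min_samples_split=2)
sk_forest.fit(X, y)

# Training accuracy (again, no held-out split here).
(sk_forest.predict(X) == y).sum()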