import numpy as np


def expectation(X, p):
    """Return the expectation of a discrete random variable.

    Args:
        X: Iterable of the values the variable can take.
        p: Iterable of probabilities, where ``p[i]`` is the probability
            of ``X[i]``.

    Returns:
        The sum of ``x * p(x)`` over all paired entries (0.0 for empty input).

    Raises:
        ValueError: If ``X`` and ``p`` do not have the same number of entries.
    """
    values, probs = list(X), list(p)
    # zip() would silently drop the unmatched tail and return a wrong
    # expectation, so reject mismatched inputs explicitly.
    if len(values) != len(probs):
        raise ValueError(
            f"X and p must have the same length, got {len(values)} and {len(probs)}"
        )
    return np.sum([x * px for x, px in zip(values, probs)])


# Example discrete random variable: outcomes X paired with probabilities p.
X = [0, 1, 2, 3, 4]  # discrete values
p = [0.4, 0.3, 0.2, 0.05, 0.05]  # probability distribution

# Exact expectation: the probability-weighted sum of the outcomes.
print(expectation(X, p))

1.05

# Monte Carlo estimate of E[X]: the mean of N samples drawn according to p.
X = [0, 1, 2, 3, 4]
p = [0.4, 0.3, 0.2, 0.05, 0.05]

N = 100_000

# Draw all N samples in a single vectorised call; calling np.random.choice
# once per sample repeats the same set-up work N times.
samples = np.random.choice(X, size=N, p=p)
total = samples.sum()

print(total / N)

1.04918

# Outcomes and their probabilities for the E[f(X)] example.
X = [0, 1, 2, 3, 4]
p = [0.4, 0.3, 0.2, 0.05, 0.05]


def f(x):
    """Return x raised to the second power."""
    return pow(x, 2)


# Apply f to every outcome; weighting f(x) by p(x) gives E[f(X)].
fX = [f(x) for x in X]

print(expectation(fX, p))

2.35

# Monte Carlo estimate of E[f(X)]: the mean of f over N samples drawn from p.
X = [0, 1, 2, 3, 4]
p = [0.4, 0.3, 0.2, 0.05, 0.05]

N = 100_000

# Draw all N samples in a single vectorised call; f is still applied one
# sample at a time so it does not need to support arrays.
samples = np.random.choice(X, size=N, p=p)
total = sum(f(x) for x in samples)

print(total / N)

2.33089

import matplotlib.pyplot as plt


def self_information(p):
    """Return the self-information -log2(p), in bits, of probability p.

    Works element-wise when p is an array or a sequence.
    """
    log_p = np.log2(p)
    return np.negative(log_p)


# Probabilities to evaluate; the grid starts just above 0 because
# -log2(p) is unbounded as p approaches 0.
ps = np.linspace(0.0001, 1, num=1000)

fig, ax = plt.subplots()
ax.plot(ps, self_information(ps))
ax.set(title="p(x) vs I(x)", xlabel="p(x)", ylabel="I(x)")
ax.set_xticks(np.arange(0, 1.05, 0.1))
ax.set_yticks(np.arange(0, 14, 1))
ax.grid()

def entropy(p):
    """Return the entropy, in bits, of the discrete distribution p.

    Each probability px contributes self_information(px) * px; entries equal
    to zero contribute 0 so that log2(0) is never evaluated.
    """
    contributions = []
    for px in p:
        if px != 0:
            contributions.append(self_information(px) * px)
        else:
            contributions.append(0)
    return np.sum(contributions)


# A skewed distribution over five outcomes.
p = [0.4, 0.3, 0.2, 0.05, 0.05]

print(entropy(p))

1.9464393446710153

# Almost all of the probability mass on a single outcome.
p = [0.99, 0.0025, 0.0025, 0.0025, 0.0025]

print(entropy(p))

0.10079313589591118

# Deterministic case; entropy() counts the zero-probability entries as 0.
p = [1, 0, 0, 0, 0]

print(entropy(p))

0.0

# Uniform distribution over five outcomes.
p = [0.2, 0.2, 0.2, 0.2, 0.2]

print(entropy(p))

2.321928094887362

def C(x):
    """Return the binary representation of x as a string, without a prefix."""
    return f"{x:b}"


X = [0, 1, 2, 3, 4]

# Encode every outcome with the plain binary code C.
CX = [C(x) for x in X]

print(CX)

['0', '1', '10', '11', '100']

def huffman(p):
    """Return a Huffman code for an ensemble with distribution p.

    Args:
        p: Mapping from symbol to probability. Symbols only need to be
            hashable; they are never combined with one another. The mapping
            is not modified.

    Returns:
        A dict mapping every symbol of ``p`` to its code word, a string of
        "0" and "1" characters. An empty ``p`` yields ``{}`` and a
        single-symbol ``p`` assigns that symbol the one-bit code word "0".
    """
    working = dict(p)  # private copy so the caller's mapping stays intact

    if not working:
        return {}
    if len(working) == 1:
        return dict.fromkeys(working, "0")

    # Repeatedly merge the two least probable entries until two remain.
    # Each merged node is keyed by a fresh object(): deriving the key from
    # the symbols themselves (e.g. a1 + a2) can collide with a real symbol
    # and silently overwrite its probability.
    merges = []
    while len(working) > 2:
        a1, a2 = lowest_prob_pair(working)
        p1, p2 = working.pop(a1), working.pop(a2)
        node = object()
        working[node] = p1 + p2
        merges.append((node, a1, a2))

    # Base case of only two entries, assign 0 or 1 arbitrarily
    c = dict(zip(working, ["0", "1"]))

    # Undo the merges newest-first: each merged node hands its code word
    # to its two children, extended by "0" and "1" respectively.
    for node, a1, a2 in reversed(merges):
        prefix = c.pop(node)
        c[a1], c[a2] = prefix + "0", prefix + "1"

    return c


def lowest_prob_pair(p):
    """Return pair of symbols from distribution p with lowest probabilities.

    Ties are broken by the iteration order of ``p`` because the sort is
    stable.

    Raises:
        ValueError: If ``p`` contains fewer than two symbols.
    """
    if len(p) < 2:
        raise ValueError("need at least 2 symbols to pick a pair")
    lowest, second_lowest = sorted(p, key=p.get)[:2]
    return lowest, second_lowest

# Three-symbol ensemble given as symbol -> probability.
X = {"a": 0.5, "b": 0.25, "c": 0.25}

code = huffman(X)

code

{'a': '0', 'b': '10', 'c': '11'}

X = {0: 0.4, 1: 0.3, 2: 0.2, 3: 0.05, 4: 0.05}  # value: probability

# `code` is rebound here to the code table for the five-outcome ensemble.
code = huffman(X)

code

{0: '0', 1: '10', 2: '111', 3: '1100', 4: '1101'}

def C(x, code):
    """Return the code word that the table `code` assigns to symbol x.

    The symbol must be present in the table; this is checked with an assert.
    """
    assert x in code.keys()
    codeword = code[x]
    return codeword

X = [0, 1, 2, 3, 4]
# Code-word length of each outcome under the module-level `code` table.
length_CX = [len(C(x, code)) for x in X]
p = [0.4, 0.3, 0.2, 0.05, 0.05]

# Print the entropy, the expected code-word length, and entropy + 1 side by side.
print(f"             H(X) = {entropy(p)}")
print(f"E_x[length(C(X))] = {expectation(length_CX, p)}")
print(f"        H(X) + 1  = {entropy(p) + 1}")

             H(X) = 1.9464393446710153
E_x[length(C(X))] = 2.0
        H(X) + 1  = 2.946439344671015

def cross_entropy(p, q, eps=1e-10):
    """Return the cross-entropy H(p, q), in bits, of q relative to p.

    Args:
        p: Iterable of probabilities of the reference distribution.
        q: Iterable of probabilities of the compared distribution, aligned
            entry by entry with ``p``.
        eps: Value substituted for entries of ``q`` that are exactly zero,
            so that log2(0) is never evaluated.

    Returns:
        ``-sum(px * log2(qx))`` over all paired entries. A zero result is
        always returned as +0.0, never -0.0.

    Raises:
        ValueError: If ``p`` and ``q`` do not have the same number of entries.
    """
    p, q = list(p), list(q)
    # zip() would silently ignore the unmatched tail of the longer input.
    if len(p) != len(q):
        raise ValueError(
            f"p and q must have the same length, got {len(p)} and {len(q)}"
        )
    weighted_logs = np.sum(
        [px * np.log2(qx if qx != 0 else eps) for (px, qx) in zip(p, q)]
    )
    # Negating a zero sum gives -0.0; adding 0.0 turns it into +0.0 and
    # leaves every other value unchanged.
    return -weighted_logs + 0.0


# Reference distribution: all probability mass on the second outcome.
p = [0, 1]

# Candidate distributions, ordered from furthest from p to identical to p.
q_1 = [1, 0]
q_2 = [0.7, 0.3]
q_3 = [0.5, 0.5]
q_4 = [0.3, 0.7]
q_5 = [0, 1]

print(f"H(p) = {entropy(p)}")
print(f"H(p, p) = {cross_entropy(p, p)}")
candidates = {"q_1": q_1, "q_2": q_2, "q_3": q_3, "q_4": q_4, "q_5": q_5}
for label, candidate in candidates.items():
    print(f"H(p, {label}) = {cross_entropy(p, candidate)}")

H(p) = 0.0
H(p, p) = -0.0
H(p, q_1) = 33.219280948873624
H(p, q_2) = 1.7369655941662063
H(p, q_3) = 1.0
H(p, q_4) = 0.5145731728297583
H(p, q_5) = -0.0

def kl_div(p, q, eps=1e-10):
    """Return the Kullback-Leibler divergence D_KL(p || q) in bits.

    Entries of p equal to zero contribute 0. Entries of q equal to zero are
    replaced by eps before dividing, which avoids a division by zero.
    """
    terms = []
    for px, qx in zip(p, q):
        if px == 0:
            terms.append(0)
            continue
        denominator = eps if qx == 0 else qx
        terms.append(px * np.log2(px / denominator))
    return np.sum(terms)


# Reference distribution: all probability mass on the second outcome.
p = [0, 1]

q_1 = [1, 0]
q_2 = [0.7, 0.3]
q_3 = [0.5, 0.5]
q_4 = [0.3, 0.7]
q_5 = [0, 1]

# D_KL(p || q) for every candidate q, in declaration order.
candidates = {"q_1": q_1, "q_2": q_2, "q_3": q_3, "q_4": q_4, "q_5": q_5}
for label, candidate in candidates.items():
    print(f"D_KL(p || {label}) = {kl_div(p, candidate)}")

# Same pair as D_KL(p || q_3) with the arguments swapped.
print(f"D_KL(q_3 || p) = {kl_div(q_3, p)}")

D_KL(p || q_1) = 33.219280948873624
D_KL(p || q_2) = 1.7369655941662063
D_KL(p || q_3) = 1.0
D_KL(p || q_4) = 0.5145731728297582
D_KL(p || q_5) = 0.0
D_KL(q_3 || p) = 15.609640474436812

# Reference distribution: all probability mass on the second outcome.
p = [0, 1]

q_1 = [1, 0]
q_2 = [0.7, 0.3]
q_3 = [0.5, 0.5]
q_4 = [0.3, 0.7]
q_5 = [0, 1]


# Check numerically, for every candidate q, that D_KL(p || q) equals
# H(p, q) - H(p).
candidates = {"q_1": q_1, "q_2": q_2, "q_3": q_3, "q_4": q_4, "q_5": q_5}
for label, candidate in candidates.items():
    agrees = np.isclose(
        kl_div(p, candidate), cross_entropy(p, candidate) - entropy(p)
    )
    print(f"D_KL(p || {label}) = H(p, {label}) - H(p)? {agrees}")

D_KL(p || q_1) = H(p, q_1) - H(p)? True
D_KL(p || q_2) = H(p, q_2) - H(p)? True
D_KL(p || q_3) = H(p, q_3) - H(p)? True
D_KL(p || q_4) = H(p, q_4) - H(p)? True
D_KL(p || q_5) = H(p, q_5) - H(p)? True

def js_div(p, q):
    """Return the Jensen-Shannon divergence between distributions p and q.

    The result is the average of the KL divergences of p and of q from
    their element-wise midpoint distribution.
    """
    midpoint = [(px + qx) / 2 for px, qx in zip(p, q)]
    from_p = kl_div(p, midpoint)
    from_q = kl_div(q, midpoint)
    return (from_p + from_q) * 0.5


# Reference distribution: all probability mass on the second outcome.
p = [0, 1]

q_1 = [1, 0]
q_2 = [0.7, 0.3]
q_3 = [0.5, 0.5]
q_4 = [0.3, 0.7]
q_5 = [0, 1]

# Print D_JS(p || q) for every candidate and check that swapping the
# arguments gives the same value.
candidates = {"q_1": q_1, "q_2": q_2, "q_3": q_3, "q_4": q_4, "q_5": q_5}
for label, candidate in candidates.items():
    forward = js_div(p, candidate)
    symmetric = np.isclose(forward, js_div(candidate, p))
    print(f"D_JS(p || {label}) = {forward} = D_JS({label} || p)? {symmetric}")

D_JS(p || q_1) = 1.0 = D_JS(q_1 || p)? True
D_JS(p || q_2) = 0.49342260576014463 = D_JS(q_2 || p)? True
D_JS(p || q_3) = 0.31127812445913283 = D_JS(q_3 || p)? True
D_JS(p || q_4) = 0.16919485510105411 = D_JS(q_4 || p)? True
D_JS(p || q_5) = 0.0 = D_JS(q_5 || p)? True

# Hold p fixed at [0, 1] and sweep q = [1 - t, t] over 1000 evenly spaced t in [0, 1].
ps = [[0, 1] for _ in range(1000)]
qs = [[1 - q, q] for q in np.linspace(0, 1, 1000)]
qs_one = [q[1] for q in qs]  # second component of each q, used as the x-coordinate

# (q[0], q[1]) pairs: the second entry positions a tick, both entries form its label.
ticks = [
    (1, 0),
    (0.9, 0.1),
    (0.8, 0.2),
    (0.7, 0.3),
    (0.6, 0.4),
    (0.5, 0.5),
    (0.4, 0.6),
    (0.3, 0.7),
    (0.2, 0.8),
    (0.1, 0.9),
    (0, 1),
]

js_divs = [js_div(p, q) for (p, q) in zip(ps, qs)]
kl_divs = [kl_div(p, q) for (p, q) in zip(ps, qs)]

# Two panels side by side: JS divergence on the left, KL divergence on the right.
fig, ax = plt.subplots(figsize=(15, 5), ncols=2)
ax1, ax2 = ax
ax1.set_xticks(
    [x for (_, x) in ticks], [f"[{t0}, {t1}]" for (t0, t1) in ticks], rotation=45
)
ax1.set_title("JS Divergence between p and q, where p = [0, 1]")
ax1.set_ylabel("JS Divergence")
ax1.set_xlabel("q")
ax1.grid()
ax1.plot(qs_one, js_divs)
ax2.set_xticks(
    [x for (_, x) in ticks], [f"[{t0}, {t1}]" for (t0, t1) in ticks], rotation=45
)
ax2.set_title("KL Divergence between p and q, where p = [0, 1]")
ax2.set_ylabel("KL Divergence")
ax2.set_xlabel("q")
ax2.grid()
ax2.plot(qs_one, kl_divs)

[<matplotlib.lines.Line2D at 0x7fc78abce160>]

def emd(p, q):
    """Return the sum of absolute running differences between p and q.

    The element-wise difference p - q is accumulated from the first entry
    onwards, and the absolute values of the running totals are summed. For
    distributions over ordered, equally spaced bins this is the earth
    mover's distance with unit distance between neighbouring bins.
    """
    surplus = np.subtract(p, q)
    carried = np.cumsum(surplus)
    return np.sum(np.abs(carried))

# Two three-bin distributions that differ only in their first and last bins.
p = [0.15, 0.2, 0.65]
q = [0.35, 0.2, 0.45]

print(emd(p, q))

0.4

# p puts all of its mass on the last of four bins.
p = [0, 0, 0, 1]

# Each q puts all of its mass on a single bin, moving one bin closer to p.
q_1 = [1, 0, 0, 0]
q_2 = [0, 1, 0, 0]
q_3 = [0, 0, 1, 0]
q_4 = [0, 0, 0, 1]

print(f"p = {p}\n")

for candidate in (q_1, q_2, q_3, q_4):
    distance = emd(p, candidate)
    divergence = js_div(p, candidate)
    print(f"q = {candidate}, EMD = {distance}, D_JS = {divergence}")

p = [0, 0, 0, 1]

q = [1, 0, 0, 0], EMD = 3, D_JS = 1.0
q = [0, 1, 0, 0], EMD = 2, D_JS = 1.0
q = [0, 0, 1, 0], EMD = 1, D_JS = 1.0
q = [0, 0, 0, 1], EMD = 0, D_JS = 0.0

# Hold p fixed at [0, 1] and sweep q = [1 - t, t] over 1000 evenly spaced t in [0, 1].
ps = [[0, 1] for _ in range(1000)]
qs = [[1 - q, q] for q in np.linspace(0, 1, 1000)]
qs_one = [q[1] for q in qs]  # second component of each q, used as the x-coordinate

# (q[0], q[1]) pairs: the second entry positions a tick, both entries form its label.
ticks = [
    (1, 0),
    (0.9, 0.1),
    (0.8, 0.2),
    (0.7, 0.3),
    (0.6, 0.4),
    (0.5, 0.5),
    (0.4, 0.6),
    (0.3, 0.7),
    (0.2, 0.8),
    (0.1, 0.9),
    (0, 1),
]

js_divs = [js_div(p, q) for (p, q) in zip(ps, qs)]
emds = [emd(p, q) for (p, q) in zip(ps, qs)]

# Two panels side by side: JS divergence on the left, EMD on the right.
fig, ax = plt.subplots(figsize=(15, 5), ncols=2)
ax1, ax2 = ax
ax1.set_xticks(
    [x for (_, x) in ticks], [f"[{t0}, {t1}]" for (t0, t1) in ticks], rotation=45
)
ax1.set_title("JS Divergence between p and q, where p = [0, 1]")
ax1.set_ylabel("JS Divergence")
ax1.set_xlabel("q")
ax1.grid()
ax1.plot(qs_one, js_divs)
ax2.set_xticks(
    [x for (_, x) in ticks], [f"[{t0}, {t1}]" for (t0, t1) in ticks], rotation=45
)
ax2.set_title("EMD between p and q, where p = [0, 1]")
ax2.set_ylabel("EM Distance")
ax2.set_xlabel("q")
ax2.grid()
ax2.plot(qs_one, emds)

[<matplotlib.lines.Line2D at 0x7fc7e358c760>]

Introduction to information theory (from a machine learning perspective)