TF-IDF from scratch¶
TF-IDF (term frequency-inverse document frequency) is a method of calculating the importance of each word in a set of documents. The TF-IDF values for a document are also commonly used as an embedding (a dense vector representation) for that document. These embeddings can then be used as the input to a machine learning model, e.g. a logistic regression model to classify your documents.
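For example, here's a minimal sketch of that idea using scikit-learn. The documents, sentiment labels, and choice of classifier below are purely hypothetical, just to illustrate the shape of the workflow:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
# hypothetical labelled documents, for illustration only
train_docs = ["I love this film", "a terrible waste of time", "great acting and plot", "boring and predictable"]
train_labels = [1, 0, 1, 0]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(train_docs)  # one TF-IDF embedding per document
clf = LogisticRegression().fit(X, train_labels)
clf.predict(vectorizer.transform(["what a great film"]))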
The sklearn library has an implementation of TF-IDF, but let's try and figure out how to implement it ourselves!
First, we'll compute the embeddings of our documents using sklearn, so we can use them to make sure our own implementation is correct.
Here are our documents:
documents = [
    "I love baseball. It's my favorite sport of all time.",
    "My favorite sport: cricket. I love cricket.",
    "Cricket. There's a sport. A sport for the ages!",
]
TF-IDF in sklearn¶
Here's how to create TF-IDF embeddings for documents using sklearn:
- Create an instance of TfidfVectorizer.
- "Fit" the TfidfVectorizer on your documents to calculate the TF and IDF values.
- "Transform" your documents, which applies the TF-IDF algorithm to them.
- Cast the embeddings to a numpy array with toarray.
from sklearn.feature_extraction.text import TfidfVectorizer
vec = TfidfVectorizer()
vec.fit(documents)
tfidf = vec.transform(documents).toarray()
The TF-IDF values are a [n documents, vocabulary size] array, where the vocabulary size is the number of unique tokens across all documents after preprocessing and tokenization.
tfidf
array([[0.        , 0.37571621, 0.37571621, 0.        , 0.28574186,
        0.        , 0.37571621, 0.28574186, 0.28574186, 0.37571621,
        0.22190405, 0.        , 0.        , 0.37571621],
       [0.        , 0.        , 0.        , 0.72532878, 0.36266439,
        0.        , 0.        , 0.36266439, 0.36266439, 0.        ,
        0.28164125, 0.        , 0.        , 0.        ],
       [0.40914568, 0.        , 0.        , 0.31116583, 0.        ,
        0.40914568, 0.        , 0.        , 0.        , 0.        ,
        0.48329606, 0.40914568, 0.40914568, 0.        ]])
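As a quick sanity check, the array has one row per document and one column per vocabulary token:
tfidf.shape, len(vec.get_feature_names_out())
((3, 14), 14)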
We can get the actual vocabulary items using get_feature_names_out.
vec.get_feature_names_out()
array(['ages', 'all', 'baseball', 'cricket', 'favorite', 'for', 'it', 'love', 'my', 'of', 'sport', 'the', 'there', 'time'], dtype=object)
Finally, we can combine the tfidf array and the vocabulary to make things easier to visualize by converting them into a pandas DataFrame:
import pandas as pd
sklearn_df = pd.DataFrame(tfidf, columns=vec.get_feature_names_out())
sklearn_df
|   | ages | all | baseball | cricket | favorite | for | it | love | my | of | sport | the | there | time |
|---|------|-----|----------|---------|----------|-----|----|------|----|----|-------|-----|-------|------|
| 0 | 0.000000 | 0.375716 | 0.375716 | 0.000000 | 0.285742 | 0.000000 | 0.375716 | 0.285742 | 0.285742 | 0.375716 | 0.221904 | 0.000000 | 0.000000 | 0.375716 |
| 1 | 0.000000 | 0.000000 | 0.000000 | 0.725329 | 0.362664 | 0.000000 | 0.000000 | 0.362664 | 0.362664 | 0.000000 | 0.281641 | 0.000000 | 0.000000 | 0.000000 |
| 2 | 0.409146 | 0.000000 | 0.000000 | 0.311166 | 0.000000 | 0.409146 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.483296 | 0.409146 | 0.409146 | 0.000000 |
Now, we'll go through how to do the above, but from scratch in Python.
First, we need a tokenizer. This is a function that converts our documents (strings) into tokens (a list of strings). We'll do this using a regular expression: the one used as the default token_pattern argument in the TfidfVectorizer documentation.
Note: The TfidfVectorizer also does some preprocessing, namely lowercasing and (optionally) stripping accents from the text. In our implementation we'll only lowercase, for simplicity. This means that technically you may get different results than when using the TfidfVectorizer, depending on what's in your documents.
import re
def tokenizer(s: str) -> list[str]:
    # lowercase and extract tokens of two or more word characters
    return [token.lower() for token in re.findall(r"(?u)\b\w\w+\b", s)]
tokenizer("hello, world")
['hello', 'world']
We can view the tokens in each document:
for i, document in enumerate(documents, start=1):
    print(f"document {i}:", tokenizer(document))
document 1: ['love', 'baseball', 'it', 'my', 'favorite', 'sport', 'of', 'all', 'time']
document 2: ['my', 'favorite', 'sport', 'cricket', 'love', 'cricket']
document 3: ['cricket', 'there', 'sport', 'sport', 'for', 'the', 'ages']
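If you want to compare against sklearn, the fitted vectorizer exposes its combined preprocessing and tokenization step via build_analyzer; with the default settings used above it should produce the same tokens as our tokenizer:
analyzer = vec.build_analyzer()
analyzer(documents[0])  # should match tokenizer(documents[0])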
Vocabulary¶
Next, we want to get the vocabulary of our documents. This is a list of all the unique tokens in our documents.
We do this by using our tokenizer function and Python's collections.Counter class, which keeps a running count of the tokens seen via its update method.
Sorting the vocabulary isn't necessary, but it's done in the TfidfVectorizer, so we do it here too.
import typing
from collections import Counter
def get_vocabulary(documents: list[str], tokenizer: typing.Callable) -> list[str]:
    # count every token across all documents, then return the unique tokens sorted
    cnt = Counter()
    for doc in documents:
        tokens = tokenizer(doc)
        cnt.update(tokens)
    return sorted([k for k, _ in cnt.most_common()])
vocabulary = get_vocabulary(documents, tokenizer)
vocabulary
['ages', 'all', 'baseball', 'cricket', 'favorite', 'for', 'it', 'love', 'my', 'of', 'sport', 'the', 'there', 'time']
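This matches the feature names sklearn extracted earlier, which we can check directly:
list(vec.get_feature_names_out()) == vocabulary
True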
Term Frequency¶
We then calculate the term frequency values. This value tells us the relative frequency of a token within a document.
Term frequency is given by:
$$\text{tf}(t,d) = \frac{f_{t,d}}{|d|}$$
The term frequency for each token, $t$, in each document, $d$, is $\text{tf}(t,d)$, and is calculated by dividing the number of times that $t$ appears in $d$ ($f_{t,d}$) by the total number of tokens in the document, $|d|$.
In the code below, $f_{t,d}$ is given by cnt[token], and $|d|$ is given by tot_tokens. We calculate both for each document individually.
Note: we calculate the term frequency per document for each token in the vocabulary, not just each token in the document. Tokens in the vocabulary but not in the document will get a term frequency value of zero.
def get_tf(
    documents: list[str], vocabulary: list[str], tokenizer: typing.Callable
) -> list[dict[str, float]]:
    tf = []
    for doc in documents:
        tokens = tokenizer(doc)
        cnt = Counter(tokens)
        tot_tokens = sum(cnt.values())
        # term frequency of every vocabulary token in this document
        # (vocabulary tokens not in the document get a count of zero)
        doc_tf = {token: cnt[token] / tot_tokens for token in vocabulary}
        tf.append(doc_tf)
    return tf
tf = get_tf(documents, vocabulary, tokenizer)
tf
[{'ages': 0.0, 'all': 0.1111111111111111, 'baseball': 0.1111111111111111, 'cricket': 0.0, 'favorite': 0.1111111111111111, 'for': 0.0, 'it': 0.1111111111111111, 'love': 0.1111111111111111, 'my': 0.1111111111111111, 'of': 0.1111111111111111, 'sport': 0.1111111111111111, 'the': 0.0, 'there': 0.0, 'time': 0.1111111111111111}, {'ages': 0.0, 'all': 0.0, 'baseball': 0.0, 'cricket': 0.3333333333333333, 'favorite': 0.16666666666666666, 'for': 0.0, 'it': 0.0, 'love': 0.16666666666666666, 'my': 0.16666666666666666, 'of': 0.0, 'sport': 0.16666666666666666, 'the': 0.0, 'there': 0.0, 'time': 0.0}, {'ages': 0.14285714285714285, 'all': 0.0, 'baseball': 0.0, 'cricket': 0.14285714285714285, 'favorite': 0.0, 'for': 0.14285714285714285, 'it': 0.0, 'love': 0.0, 'my': 0.0, 'of': 0.0, 'sport': 0.2857142857142857, 'the': 0.14285714285714285, 'there': 0.14285714285714285, 'time': 0.0}]
Let's verify this is correct. In the first document, the token "baseball" has a term-frequency value of 0.111 (to three decimal places).
This is because the token "baseball" appears once in the document, and there are nine tokens in the document:
$$\text{tf}(\text{"baseball"},D_1) = \frac{1}{9} = 0.111 \dots$$
For the second document, the token "cricket" appears twice, and there are six tokens in the document:
$$\text{tf}(\text{"cricket"},D_2) = \frac{2}{6} = 0.333 \dots$$
For the third document, the token "baseball" doesn't appear at all, and there are seven tokens in the document:
$$\text{tf}(\text{"baseball"},D_3) = \frac{0}{7} = 0$$
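We can confirm these worked examples against the values returned by get_tf:
tf[0]["baseball"], tf[1]["cricket"], tf[2]["baseball"]
(0.1111111111111111, 0.3333333333333333, 0.0)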
Inverse Document Frequency¶
Next, we'll calculate the inverse document frequency. This value tells us how important a token is by how many documents it appears in. If a token appears in many documents, it's probably not important. If it appears in very few documents, then it might be important.
There are a few different ways to calculate it; the most basic is:
$$\text{idf}(t) = \ln\bigg(\frac{|D|}{n_t}\bigg)$$
$|D|$ is the total number of documents, and $n_t$ is the number of documents that contain token, $t$. Note that inverse document frequency, unlike term frequency, is not calculated per document. Also, $n_t$ does not care how many times the token appears in each document, just how many documents contain that token.
The formula for inverse document frequency used in sklearn is a little different (see the sklearn documentation for details). It's calculated as:
$$\text{idf}(t) = 1 + \ln\bigg(\frac{1 + |D|}{1 + n_t}\bigg)$$
In the code below, $|D|$ is given by n, and $n_t$ is given by cnt[v].
from collections import defaultdict
import numpy as np
def get_idf(
    documents: list[str], vocabulary: list[str], tokenizer: typing.Callable
) -> dict[str, float]:
    n = len(documents)
    # count how many documents each vocabulary token appears in
    cnt = defaultdict(int)
    for v in vocabulary:
        for doc in documents:
            tokens = tokenizer(doc)
            if v in tokens:
                cnt[v] += 1
    # smoothed inverse document frequency, as used by sklearn
    idf = {v: 1 + np.log((1 + n) / (1 + cnt[v])) for v in vocabulary}
    return idf
idf = get_idf(documents, vocabulary, tokenizer)
idf
{'ages': 1.6931471805599454, 'all': 1.6931471805599454, 'baseball': 1.6931471805599454, 'cricket': 1.2876820724517808, 'favorite': 1.2876820724517808, 'for': 1.6931471805599454, 'it': 1.6931471805599454, 'love': 1.2876820724517808, 'my': 1.2876820724517808, 'of': 1.6931471805599454, 'sport': 1.0, 'the': 1.6931471805599454, 'there': 1.6931471805599454, 'time': 1.6931471805599454}
To verify, the token "baseball" appears in one document, and we have three documents in total, therefore:
$$\text{idf}(\text{"baseball"})=1 + \ln\bigg(\frac{1 + 3}{1 + 1}\bigg)=1+\ln\bigg(\frac{4}{2}\bigg)=1+\ln\big(2\big)=1+0.693\dots=1.693\dots$$
The token "sport" appears in all three documents, so we have:
$$\text{idf}(\text{"sport"})=1 + \ln\bigg(\frac{1 + 3}{1 + 3}\bigg)=1 + \ln\bigg(\frac{4}{4}\bigg)=1 + \ln\big(1\big)=1+0=1$$
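Again, we can confirm these worked examples against the values returned by get_idf:
idf["baseball"], idf["sport"]
(1.6931471805599454, 1.0)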
TF-IDF (Term Frequency-Inverse Document Frequency)¶
Once we have the term frequency and inverse document frequency, we can calculate the term frequency-inverse document frequency, a measure of how "important" each token is within a document, by simply multiplying them together:
$$\text{tf-idf}(t,d) = \text{tf}(t,d) \cdot \text{idf}(t)$$
This means that:
- When a token doesn't appear in many documents, but appears a lot in the documents it does appear in, it will have a high IDF value and a high TF value in those documents. This gives it a high TF-IDF value in those documents (the TF value, and therefore the TF-IDF value, is zero in documents where the token doesn't appear). We assume this token is important in the documents where it appears.
- When a token appears in every document, and also appears a lot in those documents, it will have a low IDF value and a high TF value. The low IDF value causes it to have a low TF-IDF value. We assume these are common words, e.g. "the", "and", "an", that carry little information.
- When a token doesn't appear in many documents and only appears rarely in those documents, it will have a high IDF value but a low TF value in those documents (and zero in the documents where it doesn't appear). The low TF value causes a low TF-IDF value. The assumption is that these are rare tokens that don't contain much information.
Remember that TF-IDF is calculated for each token in the vocabulary, not just for each token in each document.
This means for each document we get a $V$-dimensional vector (where $V$ is the size of the vocabulary). These vectors are what are referred to as TF-IDF embeddings, and are used as input to machine learning models.
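As a concrete example, the raw TF-IDF value for "baseball" in the first document is just the product of the values we calculated above. Note that it doesn't yet match the 0.375716 sklearn reported; the missing piece is the normalization described next:
tf[0]["baseball"] * idf["baseball"]  # ≈ 0.188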
The sklearn implementation goes a step further and also L2-normalizes each vector, which we replicate by calculating:
$$v_{\text{norm}} = \frac{v}{\|v\|_2} = \frac{v}{\sqrt{v_1^2+v_2^2+\cdots+v_n^2}}$$
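We can check that the rows of the sklearn tfidf array from earlier have already been normalized this way:
np.linalg.norm(tfidf, axis=1)
array([1., 1., 1.])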
We do this in code by calculating the norm and then dividing each individual TF-IDF value by it.
def get_tfidf(
    tf: list[dict[str, float]], idf: dict[str, float]
) -> list[dict[str, float]]:
    tfidf = []
    for doc_tf in tf:
        # multiply the tf and idf values for every vocabulary token
        doc_tfidf = dict()
        for token in doc_tf.keys():
            doc_tfidf[token] = doc_tf[token] * idf[token]
        # L2-normalize the document's TF-IDF vector
        vec = np.array(list(doc_tfidf.values()))
        norm = np.linalg.norm(vec)
        for token, value in doc_tfidf.items():
            doc_tfidf[token] = value / norm
        tfidf.append(doc_tfidf)
    return tfidf
tfidf = get_tfidf(tf, idf)
tfidf
[{'ages': 0.0, 'all': 0.3757162113174268, 'baseball': 0.3757162113174268, 'cricket': 0.0, 'favorite': 0.2857418629625308, 'for': 0.0, 'it': 0.3757162113174268, 'love': 0.2857418629625308, 'my': 0.2857418629625308, 'of': 0.3757162113174268, 'sport': 0.22190404687274298, 'the': 0.0, 'there': 0.0, 'time': 0.3757162113174268}, {'ages': 0.0, 'all': 0.0, 'baseball': 0.0, 'cricket': 0.7253287753645998, 'favorite': 0.3626643876822999, 'for': 0.0, 'it': 0.0, 'love': 0.3626643876822999, 'my': 0.3626643876822999, 'of': 0.0, 'sport': 0.2816412493743718, 'the': 0.0, 'there': 0.0, 'time': 0.0}, {'ages': 0.4091456783838912, 'all': 0.0, 'baseball': 0.0, 'cricket': 0.31116583432624145, 'favorite': 0.0, 'for': 0.4091456783838912, 'it': 0.0, 'love': 0.0, 'my': 0.0, 'of': 0.0, 'sport': 0.4832960572849686, 'the': 0.4091456783838912, 'there': 0.4091456783838912, 'time': 0.0}]
We can also put these values in a DataFrame to visualize them:
scratch_df = pd.DataFrame(tfidf)
scratch_df
|   | ages | all | baseball | cricket | favorite | for | it | love | my | of | sport | the | there | time |
|---|------|-----|----------|---------|----------|-----|----|------|----|----|-------|-----|-------|------|
| 0 | 0.000000 | 0.375716 | 0.375716 | 0.000000 | 0.285742 | 0.000000 | 0.375716 | 0.285742 | 0.285742 | 0.375716 | 0.221904 | 0.000000 | 0.000000 | 0.375716 |
| 1 | 0.000000 | 0.000000 | 0.000000 | 0.725329 | 0.362664 | 0.000000 | 0.000000 | 0.362664 | 0.362664 | 0.000000 | 0.281641 | 0.000000 | 0.000000 | 0.000000 |
| 2 | 0.409146 | 0.000000 | 0.000000 | 0.311166 | 0.000000 | 0.409146 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.483296 | 0.409146 | 0.409146 | 0.000000 |
Here's the DataFrame of values calculated by sklearn:
sklearn_df
|   | ages | all | baseball | cricket | favorite | for | it | love | my | of | sport | the | there | time |
|---|------|-----|----------|---------|----------|-----|----|------|----|----|-------|-----|-------|------|
| 0 | 0.000000 | 0.375716 | 0.375716 | 0.000000 | 0.285742 | 0.000000 | 0.375716 | 0.285742 | 0.285742 | 0.375716 | 0.221904 | 0.000000 | 0.000000 | 0.375716 |
| 1 | 0.000000 | 0.000000 | 0.000000 | 0.725329 | 0.362664 | 0.000000 | 0.000000 | 0.362664 | 0.362664 | 0.000000 | 0.281641 | 0.000000 | 0.000000 | 0.000000 |
| 2 | 0.409146 | 0.000000 | 0.000000 | 0.311166 | 0.000000 | 0.409146 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.483296 | 0.409146 | 0.409146 | 0.000000 |
We can verify the values are close by using the np.allclose function:
np.allclose(sklearn_df.values, scratch_df.values)
True
However, the values are not actually equal!
scratch_df.equals(sklearn_df)
False
If we check the values in the first document, we find that some are actually different. These differences are minuscule and are most likely due to floating-point rounding.
abs(scratch_df.iloc[0] - sklearn_df.iloc[0])
ages        0.000000e+00
all         0.000000e+00
baseball    0.000000e+00
cricket     0.000000e+00
favorite    5.551115e-17
for         0.000000e+00
it          0.000000e+00
love        5.551115e-17
my          5.551115e-17
of          0.000000e+00
sport       2.775558e-17
the         0.000000e+00
there       0.000000e+00
time        0.000000e+00
Name: 0, dtype: float64
We can now calculate TF-IDF embeddings for arbitrary documents. We have to calculate the term frequency values for the document, but we re-use the vocabulary and IDF values.
doc = ["cricket is my favorite sport."]
doc_tf = get_tf(doc, vocabulary, tokenizer)
doc_tfidf = get_tfidf(doc_tf, idf)
doc_tfidf
[{'ages': 0.0, 'all': 0.0, 'baseball': 0.0, 'cricket': 0.5268201732399633, 'favorite': 0.5268201732399633, 'for': 0.0, 'it': 0.0, 'love': 0.0, 'my': 0.5268201732399633, 'of': 0.0, 'sport': 0.4091228607670865, 'the': 0.0, 'there': 0.0, 'time': 0.0}]
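As a sanity check, the fitted sklearn vectorizer should produce (up to floating-point error) the same embedding for this new document:
np.allclose(vec.transform(doc).toarray(), pd.DataFrame(doc_tfidf).values)
True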
One thing to note is that tokens not in the vocabulary will not have a value in the output TF-IDF vector, e.g. if you have documents that mention a sport that isn't cricket or baseball. Therefore, it's important that the documents used to fit the TF-IDF transformation are representative of the documents you actually want to apply it to.
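For example, here's a (made-up) document mentioning tennis: the token "tennis" isn't in the vocabulary, so it's silently dropped, and only "love" contributes to the embedding:
oov_doc = ["I love tennis."]
oov_tfidf = get_tfidf(get_tf(oov_doc, vocabulary, tokenizer), idf)
{token: value for token, value in oov_tfidf[0].items() if value > 0}
{'love': 1.0}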