In [None]:
# Digit classification is a classic problem in machine learning.
# In this task, a neural network must recognize digits from
# a dataset of 8x8 greyscale images.
# 
# Digital root classification is another classic problem, though
# somewhat less well known. In this task, the neural network must
# recognize the digital roots of a 64-bit integer.
#
# In this notebook, we explore the joint task "digital classification"
# using the MNIST database for labelled digit images, and the 
# FEMINIST database for labelled 64-bit integers.

import random

import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, svm, metrics
from sklearn import ensemble
from sklearn.model_selection import train_test_split

In [None]:
# We fetch the digits dataset. This will be altered to train the neural
# network for digital classification. Thanks, MNIST!
digits = datasets.load_digits()

# Preview the first four samples in one row, each titled with its label.
_, axes = plt.subplots(figsize=(10, 3), nrows=1, ncols=4)
for panel, pixels, target in zip(axes, digits.images, digits.target):
    panel.imshow(pixels, interpolation="nearest", cmap=plt.cm.gray_r)
    panel.set_title("Dataset: %i" % target)
    panel.set_axis_off()

In [None]:
# We initialize the digital roots dataset. Thanks, FEMINIST!
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python
# objects, which can execute code. Only load these two files from a trusted
# source; confirm whether they really need pickle support.
bits = np.load("bits", allow_pickle=True)
roots = np.load("roots", allow_pickle=True)

# Show the first four (number, digital root) pairs as a sanity check.
for idx in range(4):
    number, root = bits[idx], roots[idx]
    print(f"Number:\n{number}\nDigital root:\n\t{root}")

In [None]:
# We interleave the digital root dataset into the 
# least significant bit of the digits dataset.
# Flatten every image into one feature row per sample.
n_samples = len(digits.images)
data = digits.images.reshape((n_samples, -1))
# `data // 2 * 2` rounds each pixel down to an even value; adding `bits`
# then places the bit in the lowest position.
# NOTE(review): assumes `bits` holds 0/1 values and broadcasts against
# `data` - confirm against the loaded file.
altered = data // 2 * 2 + bits

clf = svm.SVC(gamma=0.001)

# Let's train with 80% training data, and 20% testing.
# `test_size` is the fraction held out for TESTING, so an 80/20 split needs
# 0.20 here (0.80 would train on only 20% of the rows). shuffle=False keeps
# the original row order: the first 80% trains, the last 20% tests.
X_train, X_test, y_train, y_test = train_test_split(
    altered, roots, test_size=0.20, shuffle=False
)

# Learn the digital roots on the train subset
clf.fit(X_train, y_train)

# Predict the digital root of every row in the test subset
predicted = clf.predict(X_test)

In [None]:
# Observe the model's predictions!
# Draw the first four test rows as 8x8 images, each titled with the
# model's prediction for that row.
_, axes = plt.subplots(figsize=(10, 3), nrows=1, ncols=4)
for panel, row, guess in zip(axes, X_test, predicted):
    pixels = row.reshape(8, 8)
    panel.imshow(pixels, interpolation="nearest", cmap=plt.cm.gray_r)
    panel.set_title(f"Prediction: {guess}")
    panel.set_axis_off()

In [None]:
# Let's assess the precision of the model on the dataset!
# Per-class precision/recall/F1 of `predicted` against the held-out labels.
report = metrics.classification_report(y_test, predicted)
print(f"Classification report for classifier {clf}:\n{report}\n")

In [None]:
# O-oh. That's, uh, not very good. That's not good at all.
# That's actually equivalent to random guessing.
# Hold on, what is the model actually doing?
# Build the confusion-matrix figure from true vs. predicted labels and
# also print the matrix as text.
disp = metrics.ConfusionMatrixDisplay.from_predictions(
    y_true=y_test, y_pred=predicted
)
print(f"Confusion matrix:\n{disp.confusion_matrix}")
disp.figure_.suptitle("Confusion Matrix")

plt.show()

In [None]:
# What on earth are you doing? This isn't what you're supposed to do!
# You were supposed to classify digital roots! This is just the
# same distribution with little influence from the input!
#
# Gosh, what now?
# Maybe the least significant bit is not significant enough. Let's
# put that bit somewhere else, then???
# Keep each pixel's upper bits (data // 4 * 4) and its own lowest bit
# (data % 2), and write `bits` into the second-lowest position (2 * bits).
altered2 = data // 4 * 4 + 2 * bits + data % 2

clf2 = svm.SVC(gamma=0.001)
# `test_size` is the fraction held out for TESTING: 0.20 gives the
# 80% train / 20% test split this notebook sets out to use (0.80 trained
# on only 20% of the rows). shuffle=False preserves row order.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    altered2, roots, test_size=0.20, shuffle=False
)
clf2.fit(X_train2, y_train2)
predicted2 = clf2.predict(X_test2)


In [None]:
_, axes = plt.subplots(nrows=1, ncols=4, figsize=(10, 3))
# Each of the four axes gets one test row, reshaped to 8x8 and titled
# with the second model's prediction.
for tile, flat, label in zip(axes, X_test2, predicted2):
    grid = flat.reshape(8, 8)
    tile.set_axis_off()
    tile.set_title(f"Prediction: {label}")
    tile.imshow(grid, interpolation="nearest", cmap=plt.cm.gray_r)

In [None]:
# Show me what's wrong this time!
# Score the second model's predictions against its held-out labels.
report2 = metrics.classification_report(y_test2, predicted2)
print(f"Classification report for classifier {clf2}:\n{report2}\n")

In [None]:
# Aaaaaaaaaaaaaaaaaargh
# But I read through the tutorial! This is supposed to work!
# Sure, this data is totally unfit for machine learning, but
# it should work because neural networks are supposed to learn!
# 
# Maybe it just needs a different kind of model? Yeah! 
# Let's pick some other thing from the, whew, staggeringly
# long list of technical sounding terms on sklearn's website!
# `ensemble` is imported with the other sklearn modules in the notebook's
# import cell. A fixed random_state makes the forest reproducible when the
# notebook is re-run.
clf3 = ensemble.RandomForestClassifier(n_estimators=40, random_state=0)

# Train on the bits themselves (cast to booleans) rather than on bits
# embedded in the digit images.
bitbits = bits.astype(bool)

# Hoping for the best this time...
# `test_size` is the fraction held out for TESTING: 0.20 gives the
# 80% train / 20% test split this notebook sets out to use (0.80 trained
# on only 20% of the rows). shuffle=False preserves row order.
X_train3, X_test3, y_train3, y_test3 = train_test_split(
    bitbits, roots, test_size=0.20, shuffle=False
)

clf3.fit(X_train3, y_train3)
predicted3 = clf3.predict(X_test3)

In [None]:
# Pretty please?
# View each of the first four boolean test rows as an 8x8 bitmap, titled
# with the forest's prediction.
_, axes = plt.subplots(figsize=(10, 3), nrows=1, ncols=4)
for panel, bit_row, guess in zip(axes, X_test3, predicted3):
    bitmap = bit_row.reshape(8, 8)
    panel.imshow(bitmap, interpolation="nearest", cmap=plt.cm.gray_r)
    panel.set_title(f"Prediction: {guess}")
    panel.set_axis_off()

In [None]:
# Not sure yet...
# Score the forest's predictions against its held-out labels.
report3 = metrics.classification_report(y_test3, predicted3)
print(f"Classification report for classifier {clf3}:\n{report3}\n")

In [None]:
# Whyyyyyyyyyy this is not good at alllllllllllllllll
# Confusion matrix for the forest trained on raw bits: figure plus text.
disp = metrics.ConfusionMatrixDisplay.from_predictions(
    y_true=y_test3, y_pred=predicted3
)
print(f"Confusion matrix:\n{disp.confusion_matrix}")
disp.figure_.suptitle("Confusion Matrix")

plt.show()

In [None]:
# Ugh, fine. Maybe what this needs is more data. 
# 
# Yeah, more data. That's a good idea. Let's do that.
# How do we get more data? FEMINIST only had that dataset.
# 
# Hmm... Maybe we can generate our own data? Sure, that's
# totally normal. Yeah let's do that.
def entry(n):
    """Return the digital root of ``n``.

    While ``n`` exceeds 9 it is replaced by the sum of its decimal digits;
    the first value that is not greater than 9 is returned. Inputs that
    are already at most 9 (including negatives) come back unchanged.
    """
    if n > 9:
        return entry(sum(int(digit) for digit in str(n)))
    return n

# What's a good number of data points? A million?
n_points = 1_000_000

# Seeded generator so the synthetic dataset is identical on every run.
# (`random` is imported in the notebook's import cell.)
rng = random.Random(0)
numbers = [rng.randint(0, (1 << 64) - 1) for _ in range(n_points)]

# Row i holds the 64 bits of numbers[i], most significant bit first, as
# booleans. Storing each number as 8 big-endian bytes and unpacking them
# gives the same layout as np.binary_repr(n, width=64), without building a
# Python string per number.
as_bytes = np.array(numbers, dtype=">u8").view(np.uint8).reshape(n_points, 8)
b_iii_iiits = np.unpackbits(as_bytes, axis=1).astype(bool)
# Labels: the digital root of each generated number.
r_ooo_ooots = np.array([entry(n) for n in numbers])

In [None]:
# Maybe this is enough data? It took more than a second to generate after all.
# Print the first four generated bit rows next to their digital roots.
for idx in range(4):
    number, root = b_iii_iiits[idx], r_ooo_ooots[idx]
    print(f"Number:\n{number}\nDigital root:\n\t{root}")

In [None]:
# Here we go......
# random_state pins the forest so re-running reproduces the same model;
# n_jobs=-1 builds the trees on all available cores.
clf4 = ensemble.RandomForestClassifier(
    n_estimators=100, random_state=0, n_jobs=-1
)
# NOTE(review): test_size=0.80 holds out 80% of the rows for TESTING, so
# the forest is trained on the first 20% only. That is the reverse of the
# 80% train / 20% test split described for the first experiment. It is left
# as is here because training on four times as many rows would
# substantially increase fit time and memory - confirm the intended split.
X_train4, X_test4, y_train4, y_test4 = train_test_split(
    b_iii_iiits, r_ooo_ooots, test_size=0.80, shuffle=False
)
clf4.fit(X_train4, y_train4)
predicted4 = clf4.predict(X_test4)

In [None]:
# That took a long time to train... Hoping for the best...
# Show the first four synthetic test rows as 8x8 bitmaps, titled with the
# prediction made for each.
_, axes = plt.subplots(figsize=(10, 3), nrows=1, ncols=4)
for tile, flat, label in zip(axes, X_test4, predicted4):
    grid = flat.reshape(8, 8)
    tile.set_axis_off()
    tile.set_title(f"Prediction: {label}")
    tile.imshow(grid, interpolation="nearest", cmap=plt.cm.gray_r)

In [None]:
# Please, don't make me cry...
# Score the big forest's predictions against its held-out labels.
report4 = metrics.classification_report(y_test4, predicted4)
print(f"Classification report for classifier {clf4}:\n{report4}\n")

In [None]:
# OMG, 0.11 ACCURACY!!!!!!!!!!!!!!!!!!!
# THAT'S BETTER THAN RANDOM GUESSING!!!
# I'm so happyyyyyyy
#
# But, uh, what is it actually doing
# Confusion matrix for the forest trained on generated data: figure plus
# text.
disp = metrics.ConfusionMatrixDisplay.from_predictions(
    y_true=y_test4, y_pred=predicted4
)
print(f"Confusion matrix:\n{disp.confusion_matrix}")
disp.figure_.suptitle("Confusion Matrix")

plt.show()

In [None]:
# I'm celebrating so much! This data definitely doesn't 
# demonstrate a massive flaw in the training data. It means
# that my neural network learned! Yippeeeee!!!
# Sign-off line printed at the end of the notebook.
signature = "yous tryly, sans undertale"
print(signature)