"Artificial neural networks (ANNs) are inspired by the structure and function of the human brain. They consist of interconnected layers of artificial neurons, which process information in a similar way to biological neurons. Unlike traditional programming, ANNs learn through training on large datasets. By adjusting the connections between these artificial neurons, they can identify complex patterns and relationships within the data. This makes them powerful tools for tasks like image recognition, speech translation, and even creative text generation."- Gemini 2024
Our example task is classifying handwritten digits from the MNIST dataset, a common benchmark in machine learning. Two implementations follow: one in PyTorch and one in TensorFlow/Keras.
First, the PyTorch implementation:

import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt

# Set random seeds for reproducibility
np.random.seed(42)
torch.manual_seed(42)


class FeedForwardNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(FeedForwardNet, self).__init__()
        self.layer1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(hidden_size, hidden_size)
        self.layer3 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        x = self.layer1(x)
        x = self.relu(x)
        x = self.layer2(x)
        x = self.relu(x)
        # Return raw logits: nn.CrossEntropyLoss applies log-softmax
        # internally, so adding a Softmax layer here (a common mistake)
        # would squash the outputs twice and slow learning.
        return self.layer3(x)


def load_mnist_data():
    # Download MNIST from OpenML (70,000 28x28 grayscale digit images)
    print("Loading MNIST dataset...")
    X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)

    # Convert to float32 and scale pixel values to [0, 1]
    X = X.astype('float32') / 255.0
    y = y.astype('int64')

    # Split the data into 80% train / 20% test
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Convert to PyTorch tensors
    X_train = torch.FloatTensor(X_train)
    X_test = torch.FloatTensor(X_test)
    y_train = torch.LongTensor(y_train)
    y_test = torch.LongTensor(y_test)

    return X_train, X_test, y_train, y_test


def train_model(model, train_loader, criterion, optimizer, num_epochs, device):
    model.train()
    train_losses = []

    for epoch in range(num_epochs):
        running_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            # Zero the gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        # Average loss over the epoch
        epoch_loss = running_loss / len(train_loader)
        train_losses.append(epoch_loss)

        if (epoch + 1) % 5 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')

    return train_losses


def evaluate_model(model, test_loader, device):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    return 100 * correct / total


def main():
    # Hyperparameters
    input_size = 784   # 28x28 pixels, flattened
    hidden_size = 256
    num_classes = 10
    num_epochs = 20
    batch_size = 100
    learning_rate = 0.001

    # Device configuration
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load and prepare data
    X_train, X_test, y_train, y_test = load_mnist_data()

    # Create data loaders
    train_dataset = TensorDataset(X_train, y_train)
    test_dataset = TensorDataset(X_test, y_test)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Initialize the model
    model = FeedForwardNet(input_size, hidden_size, num_classes).to(device)

    # Loss and optimizer (CrossEntropyLoss expects raw logits)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Train the model
    print("Training the model...")
    train_losses = train_model(model, train_loader, criterion, optimizer,
                               num_epochs, device)

    # Evaluate the model
    accuracy = evaluate_model(model, test_loader, device)
    print(f'Test Accuracy: {accuracy:.2f}%')

    # Plot training loss
    plt.figure(figsize=(10, 6))
    plt.plot(train_losses)
    plt.title('Training Loss Over Time')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.show()


if __name__ == "__main__":
    main()
The same task, implemented in TensorFlow/Keras:

import tensorflow as tf

# Load data as train/test sets of sizes [60,000, 10,000]
# x values are input images, y values are output labels
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

# Normalize pixel values to the [0, 1] range to improve learning
x_train, x_test = x_train / 255.0, x_test / 255.0

# Flatten 28x28 images to 1D vectors for the network's input layer
x_train = x_train.reshape(len(x_train), 28 * 28)
x_test = x_test.reshape(len(x_test), 28 * 28)

# Convert target labels to one-hot encoded vectors
y_train = tf.keras.utils.to_categorical(y_train, 10)
y_test = tf.keras.utils.to_categorical(y_test, 10)

# Simple sequential model for multi-class classification:
# - hidden layer of 128 neurons with ReLU activation
# - output layer of 10 neurons (one per digit) with softmax activation
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(10, activation='softmax')
])

# Compile the model with categorical cross-entropy loss
# and the stochastic gradient descent (SGD) optimizer
model.compile(loss=tf.keras.losses.CategoricalCrossentropy(),
              optimizer=tf.keras.optimizers.SGD(learning_rate=0.01),
              metrics=['accuracy'])

# Train the model
model.fit(x_train, y_train,
          epochs=10,
          batch_size=32,
          validation_data=(x_test, y_test))

# Evaluate the model
test_loss, test_acc = model.evaluate(x_test, y_test)
print('Test Loss:', test_loss)
print('Test accuracy:', test_acc)
Notably, there is a structural difference between the two implementations: the PyTorch network stacks two hidden layers of 256 neurons each (784 → 256 → 256 → 10), while the Keras model uses a single hidden layer of 128 neurons (784 → 128 → 10).
This is a meaningful difference that affects model capacity and performance. The PyTorch implementation has more parameters and therefore greater capacity to learn complex patterns, while the TensorFlow implementation is simpler and may train faster.
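To make the capacity gap concrete, here is a quick sketch that counts trainable parameters in each architecture. The layer sizes come from the two listings above; the param_count helper is ours, not part of either script:

def param_count(n_in, n_out):
    # A fully connected layer has n_in * n_out weights plus n_out biases
    return n_in * n_out + n_out

# PyTorch model: 784 -> 256 -> 256 -> 10
pytorch_params = (param_count(784, 256)
                  + param_count(256, 256)
                  + param_count(256, 10))

# Keras model: 784 -> 128 -> 10
keras_params = param_count(784, 128) + param_count(128, 10)

print(pytorch_params)  # 269322
print(keras_params)    # 101770

The PyTorch version has roughly 2.6x as many parameters, which is what "greater capacity" means here.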
Multi-class classification: A problem where the model must assign each input to one of several classes. In MNIST, we classify images into the digits 0-9. The output layer uses softmax activation to produce a probability for each class.
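As a minimal illustration, softmax turns a vector of raw scores into a probability distribution over the 10 classes; the logits below are made-up values for a single image:

import torch

# Hypothetical raw scores (logits) for one image, one score per digit 0-9
logits = torch.tensor([[2.0, 0.5, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]])

# Softmax exponentiates and normalizes, so the outputs sum to 1
probs = torch.softmax(logits, dim=1)

print(probs.sum().item())          # 1.0
print(probs.argmax(dim=1).item())  # 0, the predicted digit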
This difference in optimizers may affect training speed and final model performance: Adam typically converges faster, while SGD can generalize better in some cases.
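Either script can be switched to the other optimizer with a one-line change. A PyTorch sketch, using a small stand-in model for brevity:

import torch.nn as nn
import torch.optim as optim

model = nn.Linear(784, 10)  # stand-in for the FeedForwardNet above

# Adam (used in the PyTorch listing): adaptive per-parameter step sizes
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Plain SGD (used in the Keras listing): a single global learning rate
# optimizer = optim.SGD(model.parameters(), lr=0.01)

# SGD with momentum is a common middle ground
# optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)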
Categorical cross-entropy: Measures the difference between the predicted probability distribution and the actual class. Well suited to multi-class classification tasks like MNIST.
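Concretely, for one example the loss is the negative log of the probability the model assigned to the true class. A sketch with made-up logits, checking nn.CrossEntropyLoss against the manual formula:

import torch
import torch.nn as nn

# Made-up logits for a batch of two images, 10 classes each
logits = torch.tensor([[2.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                       [0.1, 0.1, 0.1, 3.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]])
targets = torch.tensor([0, 3])  # true digit for each image

# nn.CrossEntropyLoss fuses softmax and negative log-likelihood
loss = nn.CrossEntropyLoss()(logits, targets)

# Manual equivalent: -log(predicted probability of the true class), averaged
probs = torch.softmax(logits, dim=1)
manual = -torch.log(probs[torch.arange(2), targets]).mean()

print(loss.item(), manual.item())  # the two values match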
Dropout: Randomly deactivates neurons during training. Use when the model is overfitting. Higher rates mean more regularization but slower learning.
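Neither listing above uses dropout; if it were added to the PyTorch network, it would sit between the hidden layers. A sketch, with 0.5 as an illustrative rate:

import torch.nn as nn

# The PyTorch architecture from above, rewritten with dropout added
model = nn.Sequential(
    nn.Linear(784, 256),
    nn.ReLU(),
    nn.Dropout(p=0.5),  # each activation is zeroed with probability 0.5
    nn.Linear(256, 256),
    nn.ReLU(),
    nn.Dropout(p=0.5),
    nn.Linear(256, 10),
)

# Dropout only fires in training mode; model.eval() disables it
model.train()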
L2 regularization: Penalizes large weights. Use for general-purpose regularization. Smaller values mean less regularization.
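In PyTorch this is most easily applied through the optimizer's weight_decay argument; the 1e-4 value below is an illustrative choice, not one the listings use:

import torch.nn as nn
import torch.optim as optim

model = nn.Linear(784, 10)  # stand-in for the full network

# weight_decay adds an L2 penalty on the weights at each update
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)

# Keras equivalent: pass a regularizer when building the layer, e.g.
# tf.keras.layers.Dense(128, activation='relu',
#                       kernel_regularizer=tf.keras.regularizers.l2(1e-4))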
Learning curves: Plot loss and accuracy vs. epochs to identify overfitting or underfitting.
Confusion matrix: Visualize which digits are commonly misclassified (see the sketch after this list).
Layer visualizations: Inspect which patterns each layer learns.
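As one example of these diagnostics, a confusion matrix for the PyTorch model can be built from its test predictions. This sketch assumes model, X_test, and y_test from the PyTorch listing above and uses scikit-learn's plotting helper:

import torch
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

# Assumes model, X_test, y_test from the PyTorch listing above
model.eval()
with torch.no_grad():
    preds = model(X_test).argmax(dim=1)

# Rows are true digits, columns are predictions; off-diagonal cells
# show which digits get confused with which (such as 4 vs 9)
ConfusionMatrixDisplay.from_predictions(y_test.numpy(), preds.numpy())
plt.title('MNIST Confusion Matrix')
plt.show()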
For MNIST, a simple model is often sufficient. Consider complex models only if simple ones underperform.