Notes on Programming: Generative Deep Learning, Chapter 1

Challenge

Create an image classifier with this architecture:

Fully Connected Layer, size=200
ReLU
Fully Connected Layer, size=150
ReLU
Fully Connected Layer, size=10 (output)

Train it on the CIFAR10 dataset (lr=0.001). Evaluate it. Plot a few images with the predicted and actual labels.

import numpy as np
import torchvision
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from torch import nn, optim
from torch.nn import functional as F
import matplotlib.pyplot as plt



# Number of target classes; not referenced elsewhere in the visible code.
NUM_CLASSES = 10
# Mini-batch size passed to both DataLoaders below.
BATCH_SIZE = 64


# load dataset

# NOTE(review): this fetches a separate, untransformed copy of CIFAR-10 into
# the 'cifar10' directory, and `dataset` is not used anywhere else in the
# visible code -- confirm whether this extra download is still needed.
dataset = torchvision.datasets.CIFAR10('cifar10', download=True)

# Preprocessing applied to every image: convert to a tensor, then normalize
# with mean 0.5 and std 0.5, i.e. (x - 0.5) / 0.5. The single-element tuples
# are applied to all three colour channels (see the notes after the code).
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

# Train and test splits, both stored under ./data and sharing `transform`.
train_data = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
test_data = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)

# Training batches are reshuffled; the test loader keeps a fixed order so
# that positions in the prediction arrays built later line up with it.
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)


# define the architecture

class MLP(nn.Module):
    """Fully connected image classifier with two hidden ReLU layers.

    Architecture::

        Linear(in_features -> hidden1) -> ReLU
        Linear(hidden1 -> hidden2)     -> ReLU
        Linear(hidden2 -> num_classes)

    The final layer returns raw, unnormalized scores (no softmax is
    applied), one per class.

    Args:
        in_features: Number of values per flattened input sample.
            Defaults to ``32 * 32 * 3``.
        hidden1: Width of the first hidden layer. Defaults to 200.
        hidden2: Width of the second hidden layer. Defaults to 150.
        num_classes: Number of output scores per sample. Defaults to 10.
    """

    def __init__(self, in_features=32 * 32 * 3, hidden1=200, hidden2=150,
                 num_classes=10):
        super().__init__()
        # Attribute names and creation order are kept so that existing
        # state dicts and seeded initialisation behave as before.
        self.fc1 = nn.Linear(in_features, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return class scores of shape ``(batch, num_classes)``.

        Args:
            x: Batch of inputs with the batch dimension first. Any trailing
                dimensions are flattened, so both image-shaped and
                already-flattened batches are accepted.
        """
        # flatten() copies when needed, so unlike view() it also accepts
        # non-contiguous tensors (e.g. after permute) and empty batches.
        out = x.flatten(start_dim=1)
        out = self.relu(self.fc1(out))
        out = self.relu(self.fc2(out))
        return self.fc3(out)
    
    
# train it

from tqdm import tqdm


model = MLP()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10

for epoch in range(num_epochs):
    # Accumulates the per-batch loss values; divided by the number of
    # batches when the epoch summary is printed.
    running_loss = 0.0
    for batch_images, batch_labels in tqdm(train_loader):
        # One flat vector per image, as expected by the MLP.
        flat_images = batch_images.view(batch_images.size(0), -1)

        # Standard step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        batch_loss = criterion(model(flat_images), batch_labels)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}')
  
  
  
# evaluate it
 
model.eval()
correct = 0
total = 0
# Gradients are not needed for scoring, so tracking is switched off.
with torch.no_grad():
    for batch_images, batch_labels in tqdm(test_loader, desc='Evaluating'):
        flat_images = batch_images.view(batch_images.size(0), -1)
        # The predicted class is the index of the highest score per row.
        predicted = model(flat_images).argmax(dim=1)
        total += batch_labels.size(0)
        correct += (predicted == batch_labels).sum().item()
# Share of correctly classified test samples, as a percentage.
accuracy = 100 * correct / total
print(f'Accuracy of the network on the 10000 test images: {accuracy:.2f}%') # 53.19%


# predict the classes

import numpy as np

# Class names, positioned so that classes[k] is the name for label index k
# (the fancy indexing at the bottom of this block relies on that).
classes = np.array(['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck'])

model.eval()

# Per-sample predicted and true label indices, in test_loader order.
preds_list = []
actuals_list = []
with torch.no_grad():
    for batch_images, batch_labels in test_loader:
        flat_images = batch_images.view(batch_images.size(0), -1)
        batch_preds = model(flat_images).argmax(dim=1)
        preds_list.extend(batch_preds.cpu().numpy())
        actuals_list.extend(batch_labels.cpu().numpy())

preds_array = np.array(preds_list)
actuals_array = np.array(actuals_list)
# Translate label indices into human-readable class names.
preds_single = classes[preds_array]
actual_single = classes[actuals_array]


# visualize it

def get_image(idx, loader):
    """Return the image at position ``idx`` of the dataset behind ``loader``.

    The sample is read directly from ``loader.dataset`` instead of iterating
    over batches, so the result does not depend on the loader's batch size
    and no batches are loaded just to pick one image. For a loader that
    iterates in dataset order (``shuffle=False``) this is the same image
    that appears at position ``idx`` of the loader's output.

    Args:
        idx: Zero-based sample position (any value accepted by ``int``).
        loader: A ``DataLoader`` over a map-style dataset (supporting
            ``len()`` and integer indexing) whose samples are
            ``(image, label)`` pairs.

    Returns:
        The image of the requested sample, with the dataset's transform
        applied, or ``None`` if ``idx`` is negative or past the end of the
        dataset.
    """
    dataset = loader.dataset
    position = int(idx)
    if position < 0 or position >= len(dataset):
        return None
    # Each sample is an (image, label) pair; only the image is needed.
    return dataset[position][0]


# How many test images to draw, and which ones (random positions among the
# first 1000 samples).
n_to_show = 10
indices = np.random.choice(range(1000), n_to_show)

fig = plt.figure(figsize=(15, 3))
fig.subplots_adjust(hspace=0.4, wspace=0.4)

# One subplot per selected image, laid out in a single row.
for slot, idx in enumerate(indices, start=1):
    picture = get_image(idx, test_loader)
    ax = fig.add_subplot(1, n_to_show, slot)
    ax.axis('off')
    # Predicted and actual class names are written below the image.
    ax.text(0.5, -0.35, 'pred=' + str(preds_single[idx]), fontsize=10, ha='center', transform=ax.transAxes)
    ax.text(0.5, -0.7, 'actu=' + str(actual_single[idx]), fontsize=10, ha='center', transform=ax.transAxes)
    # Move the channel axis last for imshow, then undo the (x - 0.5) / 0.5
    # normalization and clamp to the displayable [0, 1] range.
    pixels = picture.numpy().transpose(1, 2, 0)
    ax.imshow(np.clip(pixels * 0.5 + 0.5, 0, 1))

Notes:

    transforms.Normalize((0.5,), (0.5,))

transform standardizes the pixel values of the images in the dataset.

Normalization typically changes the range of pixel intensity values. The Normalize transform does this by applying the following transformation to each channel of the image:

$\text{normalized\_channel} = \frac{\text{channel} - \text{mean}}{\text{std}}$

For the CIFAR-10 dataset, images are in RGB format, meaning they have three channels (Red, Green, and Blue), each with pixel values in the range [0, 1] after applying transforms.ToTensor().

Here the Normalize transform is called with (0.5,) for both the mean and std (standard deviation) parameters. CIFAR-10 images have three channels, but because only a single value is provided for each parameter, that value is implicitly applied to all three channels.

Using 0.5 for both the mean and the standard deviation of every channel (equivalent to passing (0.5, 0.5, 0.5) for each parameter) shifts the input images' pixel value range from [0, 1] (the output of ToTensor()) to [-1, 1]: subtracting 0.5 centers the pixel values around 0, and dividing by 0.5 stretches them to the [-1, 1] range.

Operating in a [-1, 1] range can make the training process more stable and efficient for many models by ensuring that the inputs start in a more uniform and centered distribution. This is particularly beneficial for activation functions and optimization algorithms, making it easier to tune hyperparameters and achieve better performance.

Notes on Programming

Wednesday, 7 February 2024

Generative Deep Learning, Chapter 1

No comments:

Post a Comment

Parse Wikipedia dump

About Me

Blog Archive