Challenge
Create an image classifier with this architecture:
- Fully Connected Layer, size=200
- ReLU
- Fully Connected Layer, size=150
- ReLU
- Fully Connected Layer, size=10 (output)
"""Train, evaluate and visualize a small MLP image classifier on CIFAR-10.

Architecture: Linear(3072 -> 200) -> ReLU -> Linear(200 -> 150) -> ReLU
-> Linear(150 -> NUM_CLASSES).
"""
import matplotlib.pyplot as plt
import numpy as np
import torch
import torchvision
from torch import nn, optim
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from tqdm import tqdm

NUM_CLASSES = 10
BATCH_SIZE = 64
NUM_EPOCHS = 10
LEARNING_RATE = 0.001

# Human-readable names for the CIFAR-10 label indices 0..9.
CLASSES = np.array(['airplane', 'automobile', 'bird', 'cat', 'deer',
                    'dog', 'frog', 'horse', 'ship', 'truck'])


class MLP(nn.Module):
    """Three-layer fully connected classifier for 32x32 RGB images."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(32 * 32 * 3, 200)
        self.fc2 = nn.Linear(200, 150)
        self.fc3 = nn.Linear(150, NUM_CLASSES)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return raw class scores (logits) for a batch of images.

        The input is flattened to (batch, features) here, so callers may pass
        either image-shaped or already-flattened batches.
        """
        out = x.view(x.size(0), -1)
        out = self.relu(self.fc1(out))
        out = self.relu(self.fc2(out))
        return self.fc3(out)


def load_data():
    """Download CIFAR-10 (if needed) and return (train_loader, test_loader)."""
    transform = transforms.Compose([
        transforms.ToTensor(),
        # A single mean/std value is broadcast over all three channels.
        transforms.Normalize((0.5,), (0.5,)),
    ])
    train_data = datasets.CIFAR10(root='./data', train=True, download=True,
                                  transform=transform)
    test_data = datasets.CIFAR10(root='./data', train=False, download=True,
                                 transform=transform)
    train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True,
                              num_workers=2)
    # shuffle=False keeps predictions aligned with dataset indices.
    test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False,
                             num_workers=2)
    return train_loader, test_loader


def train(model, train_loader, num_epochs=NUM_EPOCHS, lr=LEARNING_RATE):
    """Train ``model`` in place with Adam and cross-entropy loss.

    Prints the mean batch loss after every epoch.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for images, labels in tqdm(train_loader):
            # No manual flattening: MLP.forward() already flattens its input.
            optimizer.zero_grad()
            loss = criterion(model(images), labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}')


def predict(model, loader):
    """Run ``model`` over ``loader`` once.

    Returns:
        (preds, actuals): two 1-D numpy arrays of predicted and true label
        indices, in the order the loader yields the samples.
    """
    model.eval()
    preds, actuals = [], []
    with torch.no_grad():
        for images, labels in tqdm(loader, desc='Evaluating'):
            preds.append(model(images).argmax(dim=1).cpu())
            actuals.append(labels.cpu())
    return torch.cat(preds).numpy(), torch.cat(actuals).numpy()


def get_image(idx, loader):
    """Return the (transformed) image at dataset position ``idx``.

    Indexes ``loader.dataset`` directly instead of iterating over batches, so
    the lookup is O(1) and does not depend on the loader's batch size or
    shuffling. Returns ``None`` when ``idx`` is out of range.
    """
    dataset = loader.dataset
    idx = int(idx)  # accept numpy integer indices
    if not 0 <= idx < len(dataset):
        return None
    image, _ = dataset[idx]
    return image


def show_predictions(loader, preds_array, actuals_array, n_to_show=10):
    """Plot ``n_to_show`` random test images with predicted and actual labels."""
    preds_single = CLASSES[preds_array]
    actual_single = CLASSES[actuals_array]

    n_to_show = min(n_to_show, len(preds_array))
    # replace=False avoids showing the same image twice.
    indices = np.random.choice(len(preds_array), n_to_show, replace=False)

    fig = plt.figure(figsize=(15, 3))
    fig.subplots_adjust(hspace=0.4, wspace=0.4)

    for i, idx in enumerate(indices):
        img = get_image(idx, loader)
        ax = fig.add_subplot(1, n_to_show, i + 1)
        ax.axis('off')
        ax.text(0.5, -0.35, 'pred=' + str(preds_single[idx]), fontsize=10,
                ha='center', transform=ax.transAxes)
        ax.text(0.5, -0.7, 'actu=' + str(actual_single[idx]), fontsize=10,
                ha='center', transform=ax.transAxes)
        # CHW -> HWC for matplotlib, then undo Normalize(0.5, 0.5): [-1, 1] -> [0, 1].
        img = img.numpy().transpose(1, 2, 0)
        img = np.clip(img * 0.5 + 0.5, 0, 1)
        ax.imshow(img)

    plt.show()


def main():
    """Load the data, train the MLP, report test accuracy and plot samples."""
    train_loader, test_loader = load_data()

    model = MLP()
    train(model, train_loader)

    # One pass over the test set yields both the accuracy and the predictions.
    preds_array, actuals_array = predict(model, test_loader)
    accuracy = 100 * (preds_array == actuals_array).mean()
    # The original author's run reported 53.19%.
    print(f'Accuracy of the network on the 10000 test images: {accuracy:.2f}%')

    show_predictions(test_loader, preds_array, actuals_array)


# DataLoader worker processes re-import this module on spawn-based platforms
# (Windows, macOS); without this guard they would re-run the whole script.
if __name__ == '__main__':
    main()
Notes:
transform standardizes the pixel values of the images in the dataset.
Normalization typically changes the range of pixel intensity values. The Normalize transform does this by applying the following transformation to each channel of the image:
output[channel] = (input[channel] - mean[channel]) / std[channel]
For the CIFAR-10 dataset, images are in RGB format, meaning they have three channels (Red, Green, and Blue), each with pixel values in the range [0, 1] after applying transforms.ToTensor().
The Normalize transform here is called with (0.5,) for both the mean and std (standard deviation) parameters. CIFAR-10 images have three channels, but because only a single value is provided for each parameter, that value is implicitly applied (broadcast) to all three channels.
The choice of 0.5 for both mean and standard deviation (written as (0.5,) in the code, which is equivalent to (0.5, 0.5, 0.5)) effectively shifts the input images' pixel value range from [0, 1] to [-1, 1] (after applying ToTensor(), which scales images to [0, 1]). This is because subtracting 0.5 centers the pixel values around 0, giving a range of [-0.5, 0.5], and dividing by 0.5 then scales them to a [-1, 1] range.
No comments:
Post a Comment