Chapter 4 - Under the Hood - Training a Digit Classifier
Deep Learning for Coders with fastai & PyTorch - Under the Hood: Training a Digit Classifier. In this notebook I add some cells for utility functions: `path`, `ls`, `untar_data`, `!`, and `tree` usage. As usual, I followed both Jeremy Howard's lesson and the Weights & Biases reading group videos. Click the `open in colab` button at the right side to view this as a notebook.
- EXPLORING THE DATASET
- BASELINE: Pixel similarity
- STOCHASTIC GRADIENT DESCENT (SGD)
- MNIST
- GOING DEEPER
I found this little one in front of my window, suffering from a foot deformity and unable to fly. Now fully recovered and back with his/her family.
import fastbook
fastbook.setup_book()
# below is for disabling Jedi autocomplete that doesn't work well.
%config Completer.use_jedi = False
from fastai.vision.all import *
from fastbook import *
matplotlib.rc('image', cmap='Greys')
path = untar_data(URLs.MNIST_SAMPLE)
??untar_data
path
!ls
It can be used like this too:
!ls /home/niyazi/.fastai/data/mnist_sample/train -d
Also like this:
!ls /home/niyazi/.fastai/data/mnist_sample/train/3 -d
!tree /home/niyazi/.fastai/data/mnist_sample/ -d
Path.BASE_PATH = path
path.ls()
(path/'train')
(path/'train').ls()
threes = (path/'train'/'3').ls().sorted()
sevens = (path/'train'/'7').ls().sorted()
im3_path = threes[1]
im3 = Image.open(im3_path)
type(im3)
#im3
The 4:10 indicates we requested the rows from index 4 (included) to 10 (not included) and the same for the columns. NumPy indexes from top to bottom and left to right, so this section is located in the top-left corner of the image.
array(im3)[4:10,4:10]
Here's the same thing as a PyTorch tensor:
tensor(im3)[4:10,4:10]
im3_t = tensor(im3)
df = pd.DataFrame(im3_t[4:15,4:22])
df.style.set_properties(**{'font-size':'6pt'}).background_gradient('OrRd')
seven_tensors = [tensor(Image.open(o)) for o in sevens]
three_tensors = [tensor(Image.open(o)) for o in threes]
len(three_tensors),len(seven_tensors)
show_image(three_tensors[0]);
show_image(tensor(im3))
stacked_sevens = torch.stack(seven_tensors).float()/255 # stack into a single 3-D tensor and scale pixel values to 0-1
stacked_threes = torch.stack(three_tensors).float()/255
stacked_threes.shape
type(stacked_sevens)
type(stacked_sevens[0])
type(seven_tensors)
len(stacked_threes.shape)
stacked_threes.ndim
mean3 = stacked_threes.mean(0)
show_image(mean3);
mean7 = stacked_sevens.mean(0)
show_image(mean7);
a_3 = stacked_threes[1]
show_image(a_3);
dist_3_abs = (a_3 - mean3).abs().mean()
dist_3_sqr = ((a_3 - mean3)**2).mean().sqrt()
dist_3_abs,dist_3_sqr
dist_7_abs = (a_3 - mean7).abs().mean()
dist_7_sqr = ((a_3 - mean7)**2).mean().sqrt()
dist_7_abs,dist_7_sqr
F.l1_loss(a_3.float(),mean7), F.mse_loss(a_3,mean7).sqrt() # the same two distances using PyTorch's built-in loss functions (L1 and RMSE)
data = [[1,2,3],[4,5,6]]
arr = array (data)
tns = tensor(data)
arr # numpy
tns # pytorch
tns[:,1]
tns[1,1:3]
tns+1
tns.type()
tns*1.5
valid_3_tens = torch.stack([tensor(Image.open(o))
for o in (path/'valid'/'3').ls()])
valid_3_tens = valid_3_tens.float()/255
valid_7_tens = torch.stack([tensor(Image.open(o))
for o in (path/'valid'/'7').ls()])
valid_7_tens = valid_7_tens.float()/255
valid_3_tens.shape,valid_7_tens.shape
def mnist_distance(a,b): return (a-b).abs().mean((-1,-2)) # mean over the last two axes (the pixel grid), so a stack of images gives one distance per image
mnist_distance(a_3, mean3)
valid_3_dist = mnist_distance(valid_3_tens, mean3)
valid_3_dist, valid_3_dist.shape
Here is another broadcasting example where the shapes don't match:
tensor([1,2,3]) + tensor(1)
(valid_3_tens-mean3).shape
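A small broadcasting sketch of my own (the batch size of 5 is made up) showing the rule PyTorch applies: the smaller tensor behaves as if it were expanded along the missing leading axis.
imgs = torch.ones(5, 28, 28)       # a hypothetical stack of 5 images
one_img = torch.ones(28, 28)       # a single image, like mean3
(imgs - one_img).shape             # one_img is broadcast across the leading axis -> torch.Size([5, 28, 28])
(imgs - one_img).abs().mean((-1,-2)) # averaging over the last two axes gives one distance per image, shape [5]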
def is_3(x): return mnist_distance(x,mean3) < mnist_distance(x,mean7)
is_3(a_3), is_3(a_3).float()
is_3(valid_3_tens)
accuracy_3s = is_3(valid_3_tens).float().mean()
accuracy_7s = (1 - is_3(valid_7_tens).float()).mean()
accuracy_3s,accuracy_7s,(accuracy_3s+accuracy_7s)/2
Arthur Samuel's machine learning process (a tiny code sketch follows the diagram below):
- Initialize the weights.
- For each image, use these weights to predict whether it appears to be a 3 or a 7.
- Based on these predictions, calculate how good the model is (its loss).
- Calculate the gradient, which measures, for each weight, how changing that weight would change the loss (this is the basis of SGD).
- Step (that is, change) all the weights based on that calculation.
- Go back to step 2 and repeat the process.
- Iterate until you decide to stop the training process (for instance, because the model is good enough or you don't want to wait any longer).
#caption The gradient descent process
#alt Graph showing the steps for Gradient Descent
gv('''
init->predict->loss->gradient->step->stop
step->predict[label=repeat]
''')
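Before the book's x**2 example, here is a minimal end-to-end sketch of those seven steps for a single made-up weight (my own toy loss, not the book's):
w = tensor(6.).requires_grad_()   # 1. initialize the weight with an arbitrary value
for i in range(10):
    loss = (w - 3.)**2            # 2-3. "predict" and compute a toy loss (its minimum is at w = 3)
    loss.backward()               # 4. gradient of the loss with respect to w
    w.data -= 0.1 * w.grad.data   # 5. step the weight downhill (0.1 is the learning rate)
    w.grad = None                 # clear the gradient before the next pass
print(w.item())                   # 6-7. after repeating, w has moved most of the way toward 3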
def f(x): return x**2
plot_function(f, 'x', 'x**2')
plt.scatter(-1.5, f(-1.5), color='red');
We need to decrease the loss.
Now our tensor xt is under investigation: PyTorch will keep its eye on it.
xt = tensor(3.).requires_grad_()
yt = f(xt)
yt
The result is 9, but notice the `grad_fn` attached to it.
yt.backward()
`backward()` calculates the derivative.
xt.grad
The result is 6, which is 2*x evaluated at x=3.
Now with a bigger tensor:
xt = tensor([3.,4.,10.]).requires_grad_()
xt
def f(x): return (x**2).sum()
yt = f(xt)
yt
Again we expect 2*xt:
yt.backward()
xt.grad
time = torch.arange(0,20).float()
time
speed = torch.randn(20)*3 + 0.75*(time-9.5)**2 + 1
plt.scatter(time,speed)
Now we are trying to come up with parameters for a quadratic function that predicts the speed at any given time. Our choice of a quadratic is an assumption; it could be something else too, but a quadratic makes the problem much easier.
Here is the function; it takes the time and the parameters as inputs and predicts a result:
def f(t, params):
a,b,c = params
return a*(t**2) + (b*t) + c
This is our loss function, which calculates the distance between the predictions and the targets (the actual measurements):
def mse(preds, targets): return ((preds-targets)**2).mean().sqrt() # note: the trailing .sqrt() makes this RMSE rather than plain MSE
params = torch.randn(3).requires_grad_()
params
orig_params = params.clone()
preds = f(time,params)
def show_preds(preds, ax=None):
if ax is None: ax=plt.subplots()[1]
ax.scatter(time, speed)
ax.scatter(time, to_np(preds), color='red')
ax.set_ylim(-300,100)
show_preds(preds)
loss = mse(preds,speed)
loss
The question is how to improve these results.
PyTorch makes it easy: we just call `backward()` on the loss, and it calculates the gradients for the parameters a, b, and c.
loss.backward()
params.grad # the derivative of the loss with respect to the initial parameter values, in other words our slope
params.grad * 1e-5 # the scalar at the end is the learning rate
params # they are still the same; nothing has been updated yet
We picked a learning rate of 1e-5, a very small step, so we don't overshoot the lowest possible loss.
lr = 1e-5
params.data -= lr * params.grad.data
params.grad = None
preds = f(time,params)
mse(preds, speed)
Let's create a function for all these steps:
def apply_step(params, prn=True):
preds = f(time, params)
loss = mse(preds, speed)
loss.backward()
params.data -= lr * params.grad.data
params.grad = None
if prn: print(loss.item())
return preds
for i in range(10): apply_step(params)
params = orig_params.detach().requires_grad_()
_,axs = plt.subplots(1,4,figsize=(12,3))
for ax in axs: show_preds(apply_step(params, False), ax)
plt.tight_layout()
train_x = torch.cat([stacked_threes, stacked_sevens]).view(-1, 28*28)
train_x.size()
train_y = tensor([1]*len(threes) + [0]*len(sevens)).unsqueeze(1)
train_x.shape,train_y.shape
temp_tensor = tensor(1)
temp_tensor
type(temp_tensor)
Is the tensor above wrong? What's the difference between `tensor(1)` and `tensor([1])`?
Now we have a 1-dim tensor:
temp_tensor = tensor([1])
Then we multiply the list inside to get four elements:
temp_tensor = tensor([1]*4)
temp_tensor
temp_tensor.shape
temp_tensor.ndim
temp_tensor.size()
(temp_tensor).unsqueeze(1)
temp_tensor.shape
temp_tensor.size()
Note that `(temp_tensor).unsqueeze(1)` above returned a new tensor with an extra dimension; it does not modify `temp_tensor` in place, which is why the shape is unchanged. You need to either assign the result or unsqueeze when creating the tensor.
temp_tensor = tensor([1]).unsqueeze(1)
temp_tensor.shape
temp_tensor = tensor([1]*1).unsqueeze(1)
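To recap the distinction explored above, a small sketch of my own:
t0 = tensor(1)         # a 0-dim scalar tensor: shape torch.Size([]), ndim 0
t1 = tensor([1])       # a 1-dim tensor: shape torch.Size([1]), ndim 1
t2 = t1.unsqueeze(1)   # unsqueeze returns a NEW tensor of shape [1, 1]; t1 itself is unchanged
t0.ndim, t1.shape, t2.shape
(t0.unsqueeze(1) would raise an error, because a 0-dim tensor only accepts dim 0 or -1 here.)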
dset = list(zip(train_x,train_y))
x,y = dset[0]
x.shape,x.ndim,y
We create a list of tuples; each tuple contains an image and a target.
valid_x = torch.cat([valid_3_tens, valid_7_tens]).view(-1, 28*28)
valid_y = tensor([1]*len(valid_3_tens) + [0]*len(valid_7_tens)).unsqueeze(1)
valid_dset = list(zip(valid_x,valid_y))
The same for the validation set.
This is not made very clear in the videos, but consider a one-layer NN with 784 (28*28) inputs and 1 output.
def init_params(size, std=1.0): return (torch.randn(size)*std).requires_grad_()
weights = init_params((28*28,1))
weights.shape
bias = init_params(1)
The function weights*pixels won't be flexible enough: it is always equal to 0 when the pixels are equal to 0 (i.e., its intercept is 0). You might remember from high school math that the formula for a line is y = w*x + b; we still need the b. We'll initialize it to a random number too:
bias
Again, the transposing of the weight matrix is not explained clearly, but Tariq Rashid's book would be very helpful at this point.
(train_x[0]*weights.T).sum() + bias
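To demystify the transpose a little (a sanity check of my own, not from the book): weights has shape [784, 1], so weights.T has shape [1, 784], which broadcasts against the 784 pixels of a single image; multiplying elementwise and summing is exactly what the matrix product in the next cell does.
lhs = (train_x[0]*weights.T).sum() + bias   # elementwise multiply with the transposed weights, then sum
rhs = train_x[0]@weights + bias             # the same computation written as a matrix product
torch.allclose(lhs, rhs)                    # True, up to floating-point rounding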
For the whole dataset, put this multiplication into a function:
def linear1(xb): return xb@weights + bias
preds = linear1(train_x)
preds
Create a tensor of results based on the predictions' values (above 0.5 counts as a 3 and below as a 7, since we labeled the threes with 1):
corrects = (preds>0.5).float() == train_y
corrects
Check the accuracy:
corrects.float().mean().item()
The accuracy is around 50%, since the weights are totally random.
Basically, we need gradients for correcting our weights: they tell us which direction to go.
If you don't understand all of this, check Khan Academy for gradients.
trgts = tensor([1,0,1])
prds = tensor([0.9, 0.4, 0.2])
def mnist_loss(predictions, targets):
return torch.where(targets==1, 1-predictions, predictions).mean()
torch.where(trgts==1, 1-prds, prds)
mnist_loss(prds,trgts)
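A quick sanity check with made-up predictions of my own: the loss is 0 when every prediction is exactly right and 1 when every prediction is exactly wrong.
mnist_loss(tensor([1.,0.,1.]), trgts), mnist_loss(tensor([0.,1.,0.]), trgts)   # (tensor(0.), tensor(1.))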
We need this for squishing the predictions between 0 and 1:
def sigmoid(x): return 1/(1+torch.exp(-x))
plot_function(torch.sigmoid, title='Sigmoid', min=-4, max=4)
Update the loss function with the sigmoid, that's all:
def mnist_loss(predictions, targets):
predictions = predictions.sigmoid()
return torch.where(targets==1, 1-predictions, predictions).mean()
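A quick check with made-up raw activations (my own numbers): thanks to the sigmoid, unbounded model outputs still yield a loss between 0 and 1, and confident correct predictions push it toward 0.
mnist_loss(tensor([6., -4., 2.]), tensor([1, 0, 1]))   # roughly 0.05: mostly confident and correct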
The following small example explains most of how a DataLoader works: it takes any collection and serves it up in shuffled mini-batches.
coll = range(15)
dl = DataLoader(coll, batch_size=5, shuffle=True)
list(dl)
But this is only a collection of plain values; for training we need tuples of (independent variable, dependent variable).
ds = L(enumerate(string.ascii_lowercase))
ds
Then put it into a DataLoader:
dl = DataLoader(ds, batch_size=6, shuffle=True)
list(dl)
Now we have batches of tuples.
All together:
It's time to implement the whole process we saw in the gradient descent diagram above. The plan: a small test batch, predictions, the loss, and gradients for the step; we also need an optimizer. We will put everything into a function except the optimizer, apply a little conversion to the raw outputs (it matters because we need to understand what our model says about each number: three or not three), measure the training accuracy and the validation accuracy over the whole set, run one epoch of training, and then more.
After that we'll create our model with PyTorch's nn.Linear instead of our linear1 function; PyTorch also creates the parameters for us, just like our init_params function did. We'll write a custom optimizer and a new training function, then swap our BasicOptim class for fastai's SGD class, drop train_model in favor of fastai's Learner.fit, and, before using Learner, pass our training and validation data into DataLoaders (not a single DataLoader).
Finally we go deeper. The basic idea is that by using more linear layers, we can have our model do more computation, and therefore model more complex functions. But there's no point just putting one linear layer directly after another one, because when we multiply things together and then add them up multiple times, that could be replaced by multiplying different things together and adding them up just once! That is to say, a series of any number of linear layers in a row can be replaced with a single linear layer with a different set of parameters. (From Fastbook; a small sketch after the simple_net definition below makes this concrete.)
Amazingly enough, it can be mathematically proven that this little function can solve any computable problem to an arbitrarily high level of accuracy, if you can find the right parameters for w1 and w2 and if you make these matrices big enough. For any arbitrarily wiggly function, we can approximate it as a bunch of lines joined together; to make it closer to the wiggly function, we just have to use shorter lines. This is known as the universal approximation theorem. The three lines of code that we have here are known as layers. The first and third are known as linear layers, and the second line of code is known variously as a nonlinearity, or activation function. (From Fastbook)
Why go deeper if two linear layers with a nonlinearity between them are enough? We already know that a single nonlinearity with two linear layers is enough to approximate any function. So why would we use deeper models? The reason is performance. With a deeper model (that is, one with more layers) we do not need to use as many parameters; it turns out that we can use smaller matrices with more layers, and get better results than we would get with larger matrices and few layers. At the end we'll read off the last recorded accuracy value.
In pseudocode, one training step over a mini-batch looks like this:
for x,y in dl:
pred = model(x)
loss = loss_func(pred, y)
loss.backward()
parameters -= parameters.grad * lr
weights = init_params((28*28,1))
bias = init_params(1)
dl = DataLoader(dset, batch_size=256)
xb,yb = first(dl)
xb.shape,yb.shape
valid_dl = DataLoader(valid_dset, batch_size=256)
batch = train_x[:4]
batch.shape
preds = linear1(batch)
preds
loss = mnist_loss(preds, train_y[:4])
loss
loss.backward()
weights.grad.shape,weights.grad.mean(),bias.grad
def calc_grad(xb, yb, model):
preds = model(xb)
loss = mnist_loss(preds, yb)
loss.backward()
calc_grad(batch, train_y[:4], linear1)
weights.grad.mean(),bias.grad
calc_grad(batch, train_y[:4], linear1) # calling it a second time: backward() accumulates gradients, so they add up
weights.grad.mean(),bias.grad # the values have doubled
weights.grad.zero_() # reset the gradients to zero before the next backward pass
bias.grad.zero_();
def train_epoch(model, lr, params):
for xb,yb in dl:
calc_grad(xb, yb, model)
for p in params:
p.data -= p.grad*lr
p.grad.zero_()
(preds>0.0).float() == train_y[:4] # threshold at 0 now, since these are raw activations before the sigmoid
def batch_accuracy(xb, yb):
preds = xb.sigmoid()
correct = (preds>0.5) == yb
return correct.float().mean()
batch_accuracy(linear1(batch), train_y[:4])
def validate_epoch(model):
accs = [batch_accuracy(model(xb), yb) for xb,yb in valid_dl]
return round(torch.stack(accs).mean().item(), 4)
validate_epoch(linear1)
lr = 1.
params = weights,bias
train_epoch(linear1, lr, params)
validate_epoch(linear1)
for i in range(20):
train_epoch(linear1, lr, params)
print(validate_epoch(linear1), end=' ')
linear_model = nn.Linear(28*28,1)
w,b = linear_model.parameters()
w.shape, b.shape
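A quick check of my own that nn.Linear computes the same thing our manual linear1 did; note that PyTorch stores the weight transposed, with shape [1, 784] instead of [784, 1]:
torch.allclose(linear_model(batch), batch @ w.T + b)   # True: still xb @ weights + bias, just with w stored as [1, 784]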
class BasicOptim:
def __init__(self,params,lr): self.params,self.lr = list(params),lr
def step(self, *args, **kwargs):
for p in self.params: p.data -= p.grad.data * self.lr
def zero_grad(self, *args, **kwargs):
for p in self.params: p.grad = None
opt = BasicOptim(linear_model.parameters(), lr)
def train_epoch(model):
for xb,yb in dl:
calc_grad(xb, yb, model)
opt.step()
opt.zero_grad()
validate_epoch(linear_model)
def train_model(model, epochs):
for i in range(epochs):
train_epoch(model)
print(validate_epoch(model), end=' ')
train_model(linear_model, 20)
linear_model = nn.Linear(28*28,1)
opt = SGD(linear_model.parameters(), lr)
train_model(linear_model, 20)
dls = DataLoaders(dl, valid_dl)
learn = Learner(dls, nn.Linear(28*28,1), opt_func=SGD,
loss_func=mnist_loss, metrics=batch_accuracy)
learn.fit(10, lr=lr)
simple_net = nn.Sequential(
nn.Linear(28*28,30),
nn.ReLU(),
nn.Linear(30,1)
)
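As quoted earlier, without the ReLU in the middle the two linear layers would collapse into a single linear layer. A small sketch of my own (not from the book) making that concrete:
l1, l2 = nn.Linear(28*28, 30, bias=False), nn.Linear(30, 1, bias=False)
single = nn.Linear(28*28, 1, bias=False)
single.weight.data = (l2.weight @ l1.weight).detach()   # fold the two weight matrices into one
xs = torch.randn(4, 28*28)
torch.allclose(l2(l1(xs)), single(xs), atol=1e-5)       # True: stacked linear layers are just one linear layer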
learn = Learner(dls, simple_net, opt_func=SGD,
loss_func=mnist_loss, metrics=batch_accuracy)
learn.fit(40, 0.1)
plt.plot(L(learn.recorder.values).itemgot(2)); # plot the metric (batch_accuracy) recorded at each epoch
learn.recorder.values[-1][2] # the final accuracy
dls = ImageDataLoaders.from_folder(path)
learn = cnn_learner(dls, resnet18, pretrained=False,
loss_func=F.cross_entropy, metrics=accuracy)
learn.fit_one_cycle(1, 0.1)