Backpropagation: Building Autograd
PyTorch's autograd is magic until you build it yourself. Let's build it.
The Goal
Create a tensor class that:
- Tracks its computation history
- Knows how to compute its own gradient
- Propagates gradients to inputs
# Our goal:
x = Tensor([2.0], requires_grad=True)
y = Tensor([3.0], requires_grad=True)
z = x * y
z.backward()
print(x.grad) # Should be 3.0 (∂z/∂x = y)
The Tensor Class
import numpy as np
from typing import Optional, Callable, List
class Tensor:
    def __init__(
        self,
        data: np.ndarray,
        requires_grad: bool = False,
        grad_fn: Optional['BackwardFunction'] = None,
    ):
        self.data = np.array(data, dtype=np.float64)
        self.requires_grad = requires_grad
        self.grad_fn = grad_fn
        self.grad: Optional[np.ndarray] = None

    def backward(self, grad: Optional[np.ndarray] = None):
        if grad is None:
            grad = np.ones_like(self.data)
        if self.grad is None:
            self.grad = grad
        else:
            self.grad = self.grad + grad  # Accumulate
        if self.grad_fn is not None:
            self.grad_fn.backward(grad)
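The recursive call at the end is the chain rule made mechanical. If $z = f(y)$ and $y = g(x)$, then $\frac{\partial z}{\partial x} = \frac{\partial z}{\partial y} \cdot \frac{\partial y}{\partial x}$: the grad argument carries the upstream term $\frac{\partial z}{\partial y}$, each operation multiplies it by its own local derivative, and the product is handed to its inputs, which repeat the process until the leaves are reached.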
The BackwardFunction Base
class BackwardFunction:
    def __init__(self, *tensors: Tensor):
        self.inputs = tensors

    def backward(self, grad_output: np.ndarray):
        """Compute gradients for each input and propagate."""
        raise NotImplementedError
Each operation will subclass this and implement backward.
Implementing Addition
Forward: $c = a + b$
Backward: $\frac{\partial c}{\partial a} = 1$, $\frac{\partial c}{\partial b} = 1$
class AddBackward(BackwardFunction):
    def backward(self, grad_output: np.ndarray):
        a, b = self.inputs
        if a.requires_grad:
            a.backward(grad_output * 1)  # ∂c/∂a = 1
        if b.requires_grad:
            b.backward(grad_output * 1)  # ∂c/∂b = 1

def add(a: Tensor, b: Tensor) -> Tensor:
    requires_grad = a.requires_grad or b.requires_grad
    grad_fn = AddBackward(a, b) if requires_grad else None
    return Tensor(
        data=a.data + b.data,
        requires_grad=requires_grad,
        grad_fn=grad_fn,
    )

# Add as method
Tensor.__add__ = add
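A quick sanity check of the addition path (a minimal sketch; the expected values assume the Tensor and add code above):

a = Tensor([1.0], requires_grad=True)
b = Tensor([2.0], requires_grad=True)
c = a + b
c.backward()
print(a.grad, b.grad)  # [1.] [1.]  (addition passes the incoming gradient through unchanged)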
Implementing Multiplication
Forward: $c = a \times b$
Backward: $\frac{\partial c}{\partial a} = b$, $\frac{\partial c}{\partial b} = a$
class MulBackward(BackwardFunction):
    def backward(self, grad_output: np.ndarray):
        a, b = self.inputs
        if a.requires_grad:
            a.backward(grad_output * b.data)  # ∂c/∂a = b
        if b.requires_grad:
            b.backward(grad_output * a.data)  # ∂c/∂b = a

def mul(a: Tensor, b: Tensor) -> Tensor:
    requires_grad = a.requires_grad or b.requires_grad
    grad_fn = MulBackward(a, b) if requires_grad else None
    return Tensor(
        data=a.data * b.data,
        requires_grad=requires_grad,
        grad_fn=grad_fn,
    )

Tensor.__mul__ = mul
Implementing More Operations
Power:
class PowBackward(BackwardFunction):
    def __init__(self, tensor: Tensor, exponent: float):
        super().__init__(tensor)
        self.exponent = exponent

    def backward(self, grad_output: np.ndarray):
        a = self.inputs[0]
        if a.requires_grad:
            # ∂(a^n)/∂a = n * a^(n-1)
            local_grad = self.exponent * (a.data ** (self.exponent - 1))
            a.backward(grad_output * local_grad)
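The forward wrapper follows the same pattern as add and mul; here is a minimal sketch, with the helper name power chosen for this example:

def power(a: Tensor, exponent: float) -> Tensor:
    grad_fn = PowBackward(a, exponent) if a.requires_grad else None
    return Tensor(
        data=a.data ** exponent,
        requires_grad=a.requires_grad,
        grad_fn=grad_fn,
    )

# Register so `a ** n` works
Tensor.__pow__ = power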
ReLU:
class ReLUBackward(BackwardFunction):
    def backward(self, grad_output: np.ndarray):
        a = self.inputs[0]
        if a.requires_grad:
            # ∂relu(a)/∂a = 1 if a > 0 else 0
            local_grad = (a.data > 0).astype(np.float64)
            a.backward(grad_output * local_grad)
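The training code later assumes a relu forward function; a minimal sketch, built the same way as add and mul:

def relu(a: Tensor) -> Tensor:
    grad_fn = ReLUBackward(a) if a.requires_grad else None
    return Tensor(
        data=np.maximum(a.data, 0.0),
        requires_grad=a.requires_grad,
        grad_fn=grad_fn,
    )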
Testing Our Autograd
# Test against PyTorch
import torch
# Our implementation
x1 = Tensor([2.0], requires_grad=True)
y1 = Tensor([3.0], requires_grad=True)
z1 = x1 * y1 + x1
z1.backward()
# PyTorch
x2 = torch.tensor([2.0], requires_grad=True)
y2 = torch.tensor([3.0], requires_grad=True)
z2 = x2 * y2 + x2
z2.backward()
print(f"Our x.grad: {x1.grad}") # [4.0]
print(f"PyTorch x.grad: {x2.grad}") # tensor([4.])
assert np.allclose(x1.grad, x2.grad.numpy())
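PyTorch is not the only reference. A central-difference check catches most backward bugs with no external dependency; this is a sketch, and the helper name numerical_grad is my own:

def numerical_grad(f, x: Tensor, eps: float = 1e-6) -> np.ndarray:
    """Estimate the gradient of the (summed) output of f with respect to x."""
    grad = np.zeros_like(x.data)
    for i in range(x.data.size):
        orig = x.data.flat[i]
        x.data.flat[i] = orig + eps
        f_plus = f().data.sum()
        x.data.flat[i] = orig - eps
        f_minus = f().data.sum()
        x.data.flat[i] = orig  # Restore the original value
        grad.flat[i] = (f_plus - f_minus) / (2 * eps)
    return grad

x1.grad, y1.grad = None, None  # Clear gradients accumulated by the test above
z = x1 * y1 + x1
z.backward()
assert np.allclose(x1.grad, numerical_grad(lambda: x1 * y1 + x1, x1))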
The Linear Layer
Now we can build neural network components:
class Linear:
    def __init__(self, in_features: int, out_features: int):
        # Initialize weights uniformly in [-k, k] with k = 1/sqrt(in_features)
        k = 1 / np.sqrt(in_features)
        self.weight = Tensor(
            np.random.uniform(-k, k, (out_features, in_features)),
            requires_grad=True
        )
        self.bias = Tensor(
            np.random.uniform(-k, k, (out_features,)),
            requires_grad=True
        )

    def __call__(self, x: Tensor) -> Tensor:
        # y = x @ W^T + b  (matmul and transpose are ops built the same way as add/mul)
        return matmul(x, transpose(self.weight)) + self.bias
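Linear uses two ops not defined above: matmul (left for the exercise below) and transpose. A minimal sketch of transpose, following the same wrapper pattern; note also that the bias add relies on NumPy broadcasting, so a batch-ready AddBackward has to sum the incoming gradient over the broadcast (batch) dimension before storing it in the bias:

class TransposeBackward(BackwardFunction):
    def backward(self, grad_output: np.ndarray):
        a = self.inputs[0]
        if a.requires_grad:
            # Transposition is linear, so the gradient is simply transposed back
            a.backward(grad_output.T)

def transpose(a: Tensor) -> Tensor:
    grad_fn = TransposeBackward(a) if a.requires_grad else None
    return Tensor(
        data=a.data.T,
        requires_grad=a.requires_grad,
        grad_fn=grad_fn,
    )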
Training a Network
# Simple network: input → linear → relu → linear → output
layer1 = Linear(784, 128)
layer2 = Linear(128, 10)
def forward(x):
    h = relu(layer1(x))
    return layer2(h)
# Training step
x = Tensor(np.random.randn(32, 784)) # Batch of images
y_true = np.random.randint(0, 10, 32) # Labels
# Forward
logits = forward(x)
loss = cross_entropy(logits, y_true)
# Backward
loss.backward()
# Update (SGD)
lr = 0.01
for layer in [layer1, layer2]:
    layer.weight.data -= lr * layer.weight.grad
    layer.bias.data -= lr * layer.bias.grad
    layer.weight.grad = None
    layer.bias.grad = None
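The step above also assumes a cross_entropy op. A minimal sketch of softmax cross-entropy over integer labels, using the standard gradient $\frac{\partial L}{\partial \text{logits}} = \frac{\text{softmax}(\text{logits}) - \text{onehot}(y)}{N}$; the class and function names here are my choice:

class CrossEntropyBackward(BackwardFunction):
    def __init__(self, logits: Tensor, probs: np.ndarray, labels: np.ndarray):
        super().__init__(logits)
        self.probs = probs
        self.labels = labels

    def backward(self, grad_output: np.ndarray):
        logits = self.inputs[0]
        if logits.requires_grad:
            n = self.labels.shape[0]
            grad = self.probs.copy()
            grad[np.arange(n), self.labels] -= 1.0  # softmax - one_hot
            logits.backward(grad_output * grad / n)

def cross_entropy(logits: Tensor, labels: np.ndarray) -> Tensor:
    # Numerically stable softmax over the class dimension
    shifted = logits.data - logits.data.max(axis=1, keepdims=True)
    exp = np.exp(shifted)
    probs = exp / exp.sum(axis=1, keepdims=True)
    n = labels.shape[0]
    loss = -np.log(probs[np.arange(n), labels]).mean()
    grad_fn = CrossEntropyBackward(logits, probs, labels) if logits.requires_grad else None
    return Tensor(loss, requires_grad=logits.requires_grad, grad_fn=grad_fn)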
Capstone Connection
Why build autograd from scratch?
- Demystification: Backprop is just the chain rule applied systematically
- Debugging: When gradients are wrong, you'll know where to look
- Research: Custom operations need custom backward functions
When analyzing sycophancy:
- You might implement custom gradient-based attribution methods
- You need to understand how gradients flow through attention
🎓 Tyla's Exercise
- Implement SigmoidBackward. The derivative of sigmoid is $\sigma(x)(1 - \sigma(x))$.
- Why do we accumulate gradients instead of replacing them? Give an example where a tensor appears multiple times in the graph.
- Implement MatMulBackward. Hint: for $C = AB$, $\frac{\partial L}{\partial A} = \frac{\partial L}{\partial C} B^T$ and $\frac{\partial L}{\partial B} = A^T \frac{\partial L}{\partial C}$ (with appropriate dimension handling).
💻 Aaliyah's Exercise
Build and train a complete MLP:
class MLP:
    def __init__(self, dims: List[int]):
        """
        dims: [input_dim, hidden1, hidden2, ..., output_dim]
        """
        self.layers = []
        for i in range(len(dims) - 1):
            self.layers.append(Linear(dims[i], dims[i+1]))

    def __call__(self, x: Tensor) -> Tensor:
        for layer in self.layers[:-1]:
            x = relu(layer(x))
        return self.layers[-1](x)  # No activation on output

    def parameters(self) -> List[Tensor]:
        params = []
        for layer in self.layers:
            params.extend([layer.weight, layer.bias])
        return params
# Train on MNIST
# Target: >90% accuracy using YOUR autograd
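One hint that doesn't give away the interesting parts: parameters() lets the SGD update from the training step generalize to any depth. A sketch (sgd_step and the layer sizes are my own choices):

def sgd_step(params: List[Tensor], lr: float = 0.01):
    for p in params:
        if p.grad is not None:
            p.data -= lr * p.grad
            p.grad = None  # Clear for the next step

model = MLP([784, 128, 64, 10])
# ... forward pass, loss, loss.backward() ...
sgd_step(model.parameters(), lr=0.01)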
📚 Maneesha's Reflection
Building autograd from scratch is a classic exercise in ML education. What does it teach that reading about it doesn't?
The pattern of "forward builds graph, backward traverses it" appears in many areas (compilers, data lineage, etc.). What's the common abstraction?
Modern deep learning frameworks handle thousands of operations. How do you think they organize and test all those backward functions?