Backpropagation: Building Autograd
PyTorch's autograd is magic until you build it yourself. Let's build it.
The Goal
Create a tensor class that:
- Tracks its computation history
- Knows how to compute its own gradient
- Propagates gradients to inputs
# Our goal:
x = Tensor([2.0], requires_grad=True)
y = Tensor([3.0], requires_grad=True)
z = x * y
z.backward()
print(x.grad) # Should be 3.0 (∂z/∂x = y)
The Tensor Class
import numpy as np
from typing import Optional, Callable, List
class Tensor:
    def __init__(
        self,
        data: np.ndarray,
        requires_grad: bool = False,
        grad_fn: Optional['BackwardFunction'] = None,
    ):
        self.data = np.array(data, dtype=np.float64)
        self.requires_grad = requires_grad
        self.grad_fn = grad_fn
        self.grad: Optional[np.ndarray] = None

    def backward(self, grad: Optional[np.ndarray] = None):
        if grad is None:
            grad = np.ones_like(self.data)
        if self.grad is None:
            self.grad = grad
        else:
            self.grad = self.grad + grad  # Accumulate
        if self.grad_fn is not None:
            self.grad_fn.backward(grad)
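The recursive call at the end is the chain rule made mechanical. If $z = f(y)$ and $y = g(x)$, then $\frac{\partial z}{\partial x} = \frac{\partial z}{\partial y} \cdot \frac{\partial y}{\partial x}$: the grad argument carries the upstream term $\frac{\partial z}{\partial y}$, each operation multiplies it by its own local derivative, and the product is handed to its inputs, which repeat the process until the leaves are reached.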
The BackwardFunction Base
class BackwardFunction:
    def __init__(self, *tensors: Tensor):
        self.inputs = tensors

    def backward(self, grad_output: np.ndarray):
        """Compute gradients for each input and propagate."""
        raise NotImplementedError
Each operation will subclass this and implement backward.
Implementing Addition
Forward: $c = a + b$
Backward: $\frac{\partial c}{\partial a} = 1$, $\frac{\partial c}{\partial b} = 1$
class AddBackward(BackwardFunction):
    def backward(self, grad_output: np.ndarray):
        a, b = self.inputs
        if a.requires_grad:
            a.backward(grad_output * 1)  # ∂c/∂a = 1
        if b.requires_grad:
            b.backward(grad_output * 1)  # ∂c/∂b = 1

def add(a: Tensor, b: Tensor) -> Tensor:
    requires_grad = a.requires_grad or b.requires_grad
    grad_fn = AddBackward(a, b) if requires_grad else None
    return Tensor(
        data=a.data + b.data,
        requires_grad=requires_grad,
        grad_fn=grad_fn,
    )

# Add as method
Tensor.__add__ = add
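A quick sanity check of the addition path (a minimal sketch; the expected values assume the Tensor and add code above):

a = Tensor([1.0], requires_grad=True)
b = Tensor([2.0], requires_grad=True)
c = a + b
c.backward()
print(a.grad, b.grad)  # [1.] [1.]  (addition passes the incoming gradient through unchanged)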
Implementing Multiplication
Forward: $c = a \times b$
Backward: $\frac{\partial c}{\partial a} = b$, $\frac{\partial c}{\partial b} = a$
class MulBackward(BackwardFunction):
    def backward(self, grad_output: np.ndarray):
        a, b = self.inputs
        if a.requires_grad:
            a.backward(grad_output * b.data)  # ∂c/∂a = b
        if b.requires_grad:
            b.backward(grad_output * a.data)  # ∂c/∂b = a

def mul(a: Tensor, b: Tensor) -> Tensor:
    requires_grad = a.requires_grad or b.requires_grad
    grad_fn = MulBackward(a, b) if requires_grad else None
    return Tensor(
        data=a.data * b.data,
        requires_grad=requires_grad,
        grad_fn=grad_fn,
    )

Tensor.__mul__ = mul
Implementing More Operations
Power:
class PowBackward(BackwardFunction):
    def __init__(self, tensor: Tensor, exponent: float):
        super().__init__(tensor)
        self.exponent = exponent

    def backward(self, grad_output: np.ndarray):
        a = self.inputs[0]
        if a.requires_grad:
            # ∂(a^n)/∂a = n * a^(n-1)
            local_grad = self.exponent * (a.data ** (self.exponent - 1))
            a.backward(grad_output * local_grad)
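The forward wrapper follows the same pattern as add and mul; here is a minimal sketch, with the helper name power chosen for this example:

def power(a: Tensor, exponent: float) -> Tensor:
    grad_fn = PowBackward(a, exponent) if a.requires_grad else None
    return Tensor(
        data=a.data ** exponent,
        requires_grad=a.requires_grad,
        grad_fn=grad_fn,
    )

# Register so `a ** n` works
Tensor.__pow__ = power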
ReLU:
class ReLUBackward(BackwardFunction):
    def backward(self, grad_output: np.ndarray):
        a = self.inputs[0]
        if a.requires_grad:
            # ∂relu(a)/∂a = 1 if a > 0 else 0
            local_grad = (a.data > 0).astype(np.float64)
            a.backward(grad_output * local_grad)
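The training code later assumes a relu forward function; a minimal sketch, built the same way as add and mul:

def relu(a: Tensor) -> Tensor:
    grad_fn = ReLUBackward(a) if a.requires_grad else None
    return Tensor(
        data=np.maximum(a.data, 0.0),
        requires_grad=a.requires_grad,
        grad_fn=grad_fn,
    )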
Testing Our Autograd
# Test against PyTorch
import torch
# Our implementation
x1 = Tensor([2.0], requires_grad=True)
y1 = Tensor([3.0], requires_grad=True)
z1 = x1 * y1 + x1
z1.backward()
# PyTorch
x2 = torch.tensor([2.0], requires_grad=True)
y2 = torch.tensor([3.0], requires_grad=True)
z2 = x2 * y2 + x2
z2.backward()
print(f"Our x.grad: {x1.grad}") # [4.0]
print(f"PyTorch x.grad: {x2.grad}") # tensor([4.])
assert np.allclose(x1.grad, x2.grad.numpy())
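PyTorch is not the only reference. A central-difference check catches most backward bugs with no external dependency; this is a sketch, and the helper name numerical_grad is my own:

def numerical_grad(f, x: Tensor, eps: float = 1e-6) -> np.ndarray:
    """Estimate the gradient of the (summed) output of f with respect to x."""
    grad = np.zeros_like(x.data)
    for i in range(x.data.size):
        orig = x.data.flat[i]
        x.data.flat[i] = orig + eps
        f_plus = f().data.sum()
        x.data.flat[i] = orig - eps
        f_minus = f().data.sum()
        x.data.flat[i] = orig  # Restore the original value
        grad.flat[i] = (f_plus - f_minus) / (2 * eps)
    return grad

x1.grad, y1.grad = None, None  # Clear gradients accumulated by the test above
z = x1 * y1 + x1
z.backward()
assert np.allclose(x1.grad, numerical_grad(lambda: x1 * y1 + x1, x1))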
The Linear Layer
Now we can build neural network components:
class Linear:
    def __init__(self, in_features: int, out_features: int):
        # Initialize weights uniformly in [-k, k] with k = 1/sqrt(in_features)
        k = 1 / np.sqrt(in_features)
        self.weight = Tensor(
            np.random.uniform(-k, k, (out_features, in_features)),
            requires_grad=True
        )
        self.bias = Tensor(
            np.random.uniform(-k, k, (out_features,)),
            requires_grad=True
        )

    def __call__(self, x: Tensor) -> Tensor:
        # y = x @ W^T + b  (matmul and transpose are ops built the same way as add/mul)
        return matmul(x, transpose(self.weight)) + self.bias
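Linear uses two ops not defined above: matmul (left for the exercise below) and transpose. A minimal sketch of transpose, following the same wrapper pattern; note also that the bias add relies on NumPy broadcasting, so a batch-ready AddBackward has to sum the incoming gradient over the broadcast (batch) dimension before storing it in the bias:

class TransposeBackward(BackwardFunction):
    def backward(self, grad_output: np.ndarray):
        a = self.inputs[0]
        if a.requires_grad:
            # Transposition is linear, so the gradient is simply transposed back
            a.backward(grad_output.T)

def transpose(a: Tensor) -> Tensor:
    grad_fn = TransposeBackward(a) if a.requires_grad else None
    return Tensor(
        data=a.data.T,
        requires_grad=a.requires_grad,
        grad_fn=grad_fn,
    )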
Training a Network
# Simple network: input → linear → relu → linear → output
layer1 = Linear(784, 128)
layer2 = Linear(128, 10)
def forward(x):
    h = relu(layer1(x))
    return layer2(h)
# Training step
x = Tensor(np.random.randn(32, 784)) # Batch of images
y_true = np.random.randint(0, 10, 32) # Labels
# Forward
logits = forward(x)
loss = cross_entropy(logits, y_true)
# Backward
loss.backward()
# Update (SGD)
lr = 0.01
for layer in [layer1, layer2]:
    layer.weight.data -= lr * layer.weight.grad
    layer.bias.data -= lr * layer.bias.grad
    layer.weight.grad = None
    layer.bias.grad = None
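The step above also assumes a cross_entropy op. A minimal sketch of softmax cross-entropy over integer labels, using the standard gradient $\frac{\partial L}{\partial \text{logits}} = \frac{\text{softmax}(\text{logits}) - \text{onehot}(y)}{N}$; the class and function names here are my choice:

class CrossEntropyBackward(BackwardFunction):
    def __init__(self, logits: Tensor, probs: np.ndarray, labels: np.ndarray):
        super().__init__(logits)
        self.probs = probs
        self.labels = labels

    def backward(self, grad_output: np.ndarray):
        logits = self.inputs[0]
        if logits.requires_grad:
            n = self.labels.shape[0]
            grad = self.probs.copy()
            grad[np.arange(n), self.labels] -= 1.0  # softmax - one_hot
            logits.backward(grad_output * grad / n)

def cross_entropy(logits: Tensor, labels: np.ndarray) -> Tensor:
    # Numerically stable softmax over the class dimension
    shifted = logits.data - logits.data.max(axis=1, keepdims=True)
    exp = np.exp(shifted)
    probs = exp / exp.sum(axis=1, keepdims=True)
    n = labels.shape[0]
    loss = -np.log(probs[np.arange(n), labels]).mean()
    grad_fn = CrossEntropyBackward(logits, probs, labels) if logits.requires_grad else None
    return Tensor(loss, requires_grad=logits.requires_grad, grad_fn=grad_fn)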
Capstone Connection
Why build autograd from scratch?
- Demystification: Backprop is just the chain rule applied systematically
- Debugging: When gradients are wrong, you'll know where to look
- Research: Custom operations need custom backward functions
When analyzing sycophancy:
- You might implement custom gradient-based attribution methods
- You need to understand how gradients flow through attention
🎓 Tyla's Exercise
- Implement SigmoidBackward. The derivative of sigmoid is $\sigma(x)(1 - \sigma(x))$.
- Why do we accumulate gradients instead of replacing them? Give an example where a tensor appears multiple times in the graph.
- Implement MatMulBackward. Hint: for $C = AB$, $\frac{\partial L}{\partial A} = \frac{\partial L}{\partial C} B^T$ and $\frac{\partial L}{\partial B} = A^T \frac{\partial L}{\partial C}$ (with appropriate dimension handling).
💻 Aaliyah's Exercise
Build and train a complete MLP:
class MLP:
    def __init__(self, dims: List[int]):
        """
        dims: [input_dim, hidden1, hidden2, ..., output_dim]
        """
        self.layers = []
        for i in range(len(dims) - 1):
            self.layers.append(Linear(dims[i], dims[i+1]))

    def __call__(self, x: Tensor) -> Tensor:
        for layer in self.layers[:-1]:
            x = relu(layer(x))
        return self.layers[-1](x)  # No activation on output

    def parameters(self) -> List[Tensor]:
        params = []
        for layer in self.layers:
            params.extend([layer.weight, layer.bias])
        return params
# Train on MNIST
# Target: >90% accuracy using YOUR autograd
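One hint that doesn't give away the interesting parts: parameters() lets the SGD update from the training step generalize to any depth. A sketch (sgd_step and the layer sizes are my own choices):

def sgd_step(params: List[Tensor], lr: float = 0.01):
    for p in params:
        if p.grad is not None:
            p.data -= lr * p.grad
            p.grad = None  # Clear for the next step

model = MLP([784, 128, 64, 10])
# ... forward pass, loss, loss.backward() ...
sgd_step(model.parameters(), lr=0.01)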
📚 Maneesha's Reflection
Building autograd from scratch is a classic exercise in ML education. What does it teach that reading about it doesn't?
The pattern of "forward builds graph, backward traverses it" appears in many areas (compilers, data lineage, etc.). What's the common abstraction?
Modern deep learning frameworks handle thousands of operations. How do you think they organize and test all those backward functions?