Practice and reinforce the concepts from Lesson 14
In this activity, you'll build a Transformer-based text generation model from scratch. You'll implement self-attention, multi-head attention, positional encoding, and the complete GPT architecture. Then you'll fine-tune GPT-2 for custom text generation and explore advanced sampling strategies.
By completing this activity, you will:
- Implement scaled dot-product self-attention and multi-head attention
- Add sinusoidal positional encoding and assemble a complete GPT-style decoder
- Train the model on text data and generate with greedy, temperature, top-k, and nucleus sampling
- Fine-tune pre-trained GPT-2 on a custom corpus
Download the activity template from the Templates folder:
- Template: Templates/AI25-Template-activity-14-transformer-architectures.zip
- Upload activity-14-transformer-architectures.ipynb to Google Colab
- Execute the first few setup cells
TODO 1: Implement scaled dot-product attention
class SelfAttention(nn.Module):
def __init__(self, d_model):
super().__init__()
self.d_model = d_model
# TODO 1: Define Q, K, V projection layers
self.query = nn.Linear(d_model, d_model)
self.key = nn.Linear(d_model, d_model)
self.value = nn.Linear(d_model, d_model)
def forward(self, x, mask=None):
"""
Scaled dot-product attention
Args:
x: Input (batch, seq_len, d_model)
mask: Attention mask (batch, seq_len, seq_len)
0 = masked position (set to -inf)
1 = valid position
Returns:
output: Attended values (batch, seq_len, d_model)
attention_weights: (batch, seq_len, seq_len)
"""
# TODO 1: Implement self-attention
# Step 1: Project to Q, K, V
# Q = self.query(x)
# K = self.key(x)
# V = self.value(x)
#
# Step 2: Compute attention scores
# scores = Q @ K^T / sqrt(d_model)
#
# Step 3: Apply mask (set masked positions to -inf)
# if mask is not None:
# scores = scores.masked_fill(mask == 0, -1e9)
#
# Step 4: Softmax
# attention_weights = softmax(scores, dim=-1)
#
# Step 5: Apply to values
# output = attention_weights @ V
# Your code here
pass
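If you get stuck, the core of TODO 1 can be written as a small helper like the one below. This is a sketch, not the template's required structure; the helper name scaled_dot_product_attention is illustrative. Inside SelfAttention.forward you would compute Q = self.query(x), K = self.key(x), V = self.value(x) and pass them to it (here each "head" spans the full d_model, so d_k equals d_model).

import math
import torch.nn.functional as F

def scaled_dot_product_attention(Q, K, V, mask=None):
    """Illustrative helper: the computation TODO 1 asks for."""
    d_k = Q.size(-1)
    scores = Q @ K.transpose(-2, -1) / math.sqrt(d_k)   # (batch, seq_len, seq_len)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)    # masked positions receive ~zero weight
    attention_weights = F.softmax(scores, dim=-1)
    return attention_weights @ V, attention_weights     # attended values + attention map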
TODO 2: Implement multi-head attention
class MultiHeadAttention(nn.Module):
def __init__(self, d_model, num_heads):
super().__init__()
assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
self.d_model = d_model
self.num_heads = num_heads
self.d_k = d_model // num_heads
# TODO 2: Define projection layers for all heads
# Q, K, V projections (single matrix for all heads)
self.W_Q = nn.Linear(d_model, d_model)
self.W_K = nn.Linear(d_model, d_model)
self.W_V = nn.Linear(d_model, d_model)
# Output projection
self.W_O = nn.Linear(d_model, d_model)
def forward(self, x, mask=None):
"""
Multi-head attention
Args:
x: (batch, seq_len, d_model)
mask: (batch, seq_len, seq_len)
Returns:
output: (batch, seq_len, d_model)
"""
batch_size, seq_len, d_model = x.size()
# TODO 2: Implement multi-head attention
# Step 1: Project and split into heads
# Q = self.W_Q(x).view(batch, seq_len, num_heads, d_k).transpose(1, 2)
# K = self.W_K(x).view(batch, seq_len, num_heads, d_k).transpose(1, 2)
# V = self.W_V(x).view(batch, seq_len, num_heads, d_k).transpose(1, 2)
# Shape: (batch, num_heads, seq_len, d_k)
#
# Step 2: Compute attention for all heads in parallel
# scores = (Q @ K^T) / sqrt(d_k)
# Apply mask
# attention_weights = softmax(scores, dim=-1)
# attended = attention_weights @ V
#
# Step 3: Concatenate heads
# attended = attended.transpose(1, 2).contiguous()
# attended = attended.view(batch, seq_len, d_model)
#
# Step 4: Final projection
# output = self.W_O(attended)
# Your code here
pass
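One possible completion of the forward pass is sketched below as a standalone function that mirrors the method body (the name mha_forward and the mask-reshaping detail are illustrative, not part of the template):

import math
import torch

def mha_forward(self, x, mask=None):
    batch_size, seq_len, _ = x.size()
    # Project, then split the model dimension into (num_heads, d_k)
    Q = self.W_Q(x).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
    K = self.W_K(x).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
    V = self.W_V(x).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
    scores = Q @ K.transpose(-2, -1) / math.sqrt(self.d_k)    # (batch, heads, seq, seq)
    if mask is not None:
        if mask.dim() == 3:             # (batch, seq, seq) -> add a head dimension
            mask = mask.unsqueeze(1)
        scores = scores.masked_fill(mask == 0, -1e9)
    attention_weights = torch.softmax(scores, dim=-1)
    attended = attention_weights @ V                           # (batch, heads, seq, d_k)
    attended = attended.transpose(1, 2).contiguous().view(batch_size, seq_len, self.d_model)
    return self.W_O(attended)                                  # (batch, seq, d_model)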
TODO 3: Implement sinusoidal positional encoding
class PositionalEncoding(nn.Module):
"""
Add positional information to embeddings
PE(pos, 2i) = sin(pos / 10000^(2i/d_model))
PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))
"""
def __init__(self, d_model, max_len=5000):
super().__init__()
# TODO 3: Compute positional encoding matrix
# Shape: (max_len, d_model)
# For each position pos and dimension i:
# PE[pos, 2i] = sin(pos / 10000^(2i/d_model))
# PE[pos, 2i+1] = cos(pos / 10000^(2i/d_model))
pe = torch.zeros(max_len, d_model)
position = torch.arange(0, max_len).unsqueeze(1).float()
div_term = torch.exp(torch.arange(0, d_model, 2).float() *
-(math.log(10000.0) / d_model))
# Your code here to compute PE
self.register_buffer('pe', pe.unsqueeze(0)) # (1, max_len, d_model)
def forward(self, x):
"""
Args:
x: Token embeddings (batch, seq_len, d_model)
Returns:
x + positional encoding
"""
# TODO 3: Add positional encoding
# return x + self.pe[:, :x.size(1)]
# Your code here
pass
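One way to finish TODO 3 is to fill the even-indexed columns with sines and the odd-indexed columns with cosines, as sketched below (the helper name build_sinusoidal_pe is illustrative, and the slicing assumes an even d_model):

import math
import torch

def build_sinusoidal_pe(max_len, d_model):
    """Return a (1, max_len, d_model) sinusoidal positional-encoding table."""
    pe = torch.zeros(max_len, d_model)
    position = torch.arange(0, max_len).unsqueeze(1).float()              # (max_len, 1)
    div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                         -(math.log(10000.0) / d_model))                  # (d_model/2,)
    pe[:, 0::2] = torch.sin(position * div_term)   # even dimensions: sin
    pe[:, 1::2] = torch.cos(position * div_term)   # odd dimensions: cos
    return pe.unsqueeze(0)

# forward then only needs: return x + self.pe[:, :x.size(1)]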
TODO 4: Implement complete Transformer decoder block
class TransformerBlock(nn.Module):
"""
Transformer block: Attention → Add & Norm → FFN → Add & Norm
"""
def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
super().__init__()
# TODO 4: Define block components
# 1. Multi-head self-attention
self.attention = MultiHeadAttention(d_model, num_heads)
# 2. Feed-forward network (2-layer MLP)
self.ffn = nn.Sequential(
nn.Linear(d_model, d_ff),
nn.ReLU(),
nn.Linear(d_ff, d_model),
)
# 3. Layer normalization
self.norm1 = nn.LayerNorm(d_model)
self.norm2 = nn.LayerNorm(d_model)
# 4. Dropout
self.dropout = nn.Dropout(dropout)
def forward(self, x, mask=None):
"""
Args:
x: (batch, seq_len, d_model)
mask: Causal mask (batch, seq_len, seq_len)
Returns:
output: (batch, seq_len, d_model)
"""
# TODO 4: Implement Transformer block
# Step 1: Self-attention with residual connection
# attn_output = self.attention(x, mask)
# x = self.norm1(x + self.dropout(attn_output))
#
# Step 2: Feed-forward with residual connection
# ffn_output = self.ffn(x)
# x = self.norm2(x + self.dropout(ffn_output))
# Your code here
pass
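Written out, the forward pass follows the post-norm ordering described in the docstring. The sketch below mirrors the method body (not the only valid layout; pre-norm blocks also work):

def transformer_block_forward(self, x, mask=None):
    attn_output = self.attention(x, mask)              # multi-head self-attention
    x = self.norm1(x + self.dropout(attn_output))      # residual connection + LayerNorm
    ffn_output = self.ffn(x)                           # position-wise feed-forward network
    x = self.norm2(x + self.dropout(ffn_output))       # residual connection + LayerNorm
    return x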
TODO 5: Assemble full GPT architecture
class GPT(nn.Module):
"""
GPT: Decoder-only Transformer for text generation
"""
def __init__(self, vocab_size, d_model=512, num_heads=8, num_layers=6,
d_ff=2048, max_len=512, dropout=0.1):
super().__init__()
self.d_model = d_model
self.max_len = max_len
# TODO 5: Define GPT components
# 1. Token embedding
self.token_embedding = nn.Embedding(vocab_size, d_model)
# 2. Positional encoding
self.positional_encoding = PositionalEncoding(d_model, max_len)
# 3. Transformer blocks
self.blocks = nn.ModuleList([
TransformerBlock(d_model, num_heads, d_ff, dropout)
for _ in range(num_layers)
])
# 4. Final layer norm
self.ln_f = nn.LayerNorm(d_model)
# 5. Output projection (logits over vocabulary)
self.head = nn.Linear(d_model, vocab_size, bias=False)
# Initialize weights
self.apply(self._init_weights)
def _init_weights(self, module):
if isinstance(module, nn.Linear):
torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
if module.bias is not None:
torch.nn.init.zeros_(module.bias)
elif isinstance(module, nn.Embedding):
torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
def forward(self, idx, targets=None):
"""
Args:
idx: Token indices (batch, seq_len)
targets: Target tokens for training (batch, seq_len)
Returns:
logits: (batch, seq_len, vocab_size)
loss: Cross-entropy loss (if targets provided)
"""
batch_size, seq_len = idx.size()
# TODO 5: Implement GPT forward pass
# Step 1: Token embeddings + positional encoding
# token_emb = self.token_embedding(idx) * sqrt(d_model)
# x = self.positional_encoding(token_emb)
#
# Step 2: Create causal mask (lower triangular)
# mask = torch.tril(torch.ones(seq_len, seq_len))
#
# Step 3: Apply Transformer blocks
# for block in self.blocks:
# x = block(x, mask)
#
# Step 4: Final layer norm and projection
# x = self.ln_f(x)
# logits = self.head(x)
#
# Step 5: Compute loss if targets provided
# if targets is not None:
# loss = F.cross_entropy(logits.view(-1, vocab_size), targets.view(-1))
# Your code here
pass
@torch.no_grad()
def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
"""
Generate text autoregressively
Args:
idx: Starting context (batch, seq_len)
max_new_tokens: Number of tokens to generate
temperature: Sampling temperature
top_k: Top-k sampling (None = disabled)
Returns:
Generated sequence (batch, seq_len + max_new_tokens)
"""
# TODO 5: Implement text generation
# For each new token:
# 1. Crop context to max_len if needed
# 2. Forward pass to get logits
# 3. Get logits for last token
# 4. Apply temperature scaling
# 5. Optional: Top-k filtering
# 6. Sample from distribution
# 7. Append to sequence
# Your code here
pass
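A possible shape for the two methods is sketched below as standalone functions that mirror the method bodies (the names gpt_forward and gpt_generate are illustrative; inside the class you would call self(idx_cond) rather than gpt_forward):

import math
import torch
import torch.nn.functional as F

def gpt_forward(self, idx, targets=None):
    batch_size, seq_len = idx.size()
    token_emb = self.token_embedding(idx) * math.sqrt(self.d_model)
    x = self.positional_encoding(token_emb)
    # Causal mask: position i may only attend to positions <= i
    mask = torch.tril(torch.ones(seq_len, seq_len, device=idx.device))
    for block in self.blocks:
        x = block(x, mask)
    x = self.ln_f(x)
    logits = self.head(x)                                    # (batch, seq_len, vocab_size)
    loss = None
    if targets is not None:
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
    return logits, loss

@torch.no_grad()
def gpt_generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -self.max_len:]                    # crop context to max_len
        logits, _ = gpt_forward(self, idx_cond)              # in the class: self(idx_cond)
        logits = logits[:, -1, :] / temperature              # last position, temperature-scaled
        if top_k is not None:
            v, _ = torch.topk(logits, top_k)
            logits[logits < v[:, [-1]]] = -float('inf')      # keep only the top-k logits
        probs = F.softmax(logits, dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)
        idx = torch.cat([idx, next_token], dim=1)
    return idx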
The template includes a pre-built training loop.
TODO 6: Implement various sampling strategies
def greedy_sampling(logits):
"""
Always pick most probable token
Args:
logits: (batch, vocab_size)
Returns:
next_token: (batch,)
"""
# TODO 6a: Implement greedy sampling
# return torch.argmax(logits, dim=-1)
# Your code here
pass
def temperature_sampling(logits, temperature=1.0):
"""
Sample with temperature scaling
Args:
logits: (batch, vocab_size)
temperature: Higher = more random, Lower = more confident
Returns:
next_token: (batch,)
"""
# TODO 6b: Implement temperature sampling
# Step 1: Scale logits by temperature
# Step 2: Compute probabilities with softmax
# Step 3: Sample from distribution
# Your code here
pass
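A possible body for temperature sampling (a sketch; the name temperature_sampling_sketch is illustrative):

import torch
import torch.nn.functional as F

def temperature_sampling_sketch(logits, temperature=1.0):
    scaled = logits / temperature                   # T < 1 sharpens, T > 1 flattens the distribution
    probs = F.softmax(scaled, dim=-1)
    return torch.multinomial(probs, num_samples=1).squeeze(-1)   # (batch,)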
def top_k_sampling(logits, k=50):
"""
Sample from top-k most likely tokens
Args:
logits: (batch, vocab_size)
k: Number of top tokens to consider
Returns:
next_token: (batch,)
"""
# TODO 6c: Implement top-k sampling
# Step 1: Get top-k logits and indices
# Step 2: Set non-top-k logits to -inf
# Step 3: Compute probabilities and sample
# Your code here
pass
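A possible body for top-k sampling (a sketch; the name top_k_sampling_sketch is illustrative):

import torch
import torch.nn.functional as F

def top_k_sampling_sketch(logits, k=50):
    top_values, top_indices = torch.topk(logits, k, dim=-1)   # (batch, k)
    probs = F.softmax(top_values, dim=-1)                      # renormalize over the top-k
    sampled = torch.multinomial(probs, num_samples=1)          # position within the top-k
    return top_indices.gather(-1, sampled).squeeze(-1)         # map back to vocabulary ids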
def nucleus_sampling(logits, p=0.9):
"""
Sample from smallest set with cumulative probability ≥ p (top-p)
Args:
logits: (batch, vocab_size)
p: Cumulative probability threshold
Returns:
next_token: (batch,)
"""
# TODO 6d: Implement nucleus (top-p) sampling
# Step 1: Sort logits in descending order
# Step 2: Compute cumulative probabilities
# Step 3: Find cutoff (where cumulative prob > p)
# Step 4: Mask tokens beyond cutoff
# Step 5: Sample from remaining distribution
# Your code here
pass
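A possible body for nucleus (top-p) sampling (a sketch; the name nucleus_sampling_sketch is illustrative):

import torch
import torch.nn.functional as F

def nucleus_sampling_sketch(logits, p=0.9):
    sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
    cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
    # Remove tokens once the cumulative probability exceeds p,
    # shifting right so the first token past the threshold is still kept
    remove = cumulative_probs > p
    remove[..., 1:] = remove[..., :-1].clone()
    remove[..., 0] = False
    sorted_logits = sorted_logits.masked_fill(remove, -float('inf'))
    probs = F.softmax(sorted_logits, dim=-1)
    sampled = torch.multinomial(probs, num_samples=1)           # position in sorted order
    return sorted_indices.gather(-1, sampled).squeeze(-1)       # map back to vocabulary ids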
TODO 7: Fine-tune pre-trained GPT-2 on custom corpus
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
def fine_tune_gpt2(train_texts, output_dir="./fine-tuned-gpt2"):
"""
Fine-tune GPT-2 on custom text corpus
Args:
train_texts: List of text strings
output_dir: Where to save fine-tuned model
Returns:
Fine-tuned model and tokenizer
"""
# TODO 7: Implement GPT-2 fine-tuning
# Step 1: Load pre-trained GPT-2
# model = GPT2LMHeadModel.from_pretrained('gpt2')
# tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
#
# Step 2: Tokenize training data
# train_encodings = tokenizer(train_texts, ...)
#
# Step 3: Create dataset
# train_dataset = CustomDataset(train_encodings)
#
# Step 4: Set up training arguments
# training_args = TrainingArguments(...)
#
# Step 5: Train
# trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)
# trainer.train()
# Your code here
pass
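One possible shape for this function using the Hugging Face Trainer API is sketched below. The hyperparameters (max_length, epochs, batch size) and the names fine_tune_gpt2_sketch and TextDataset are placeholders, not values prescribed by the activity:

import torch
from transformers import (GPT2LMHeadModel, GPT2Tokenizer, Trainer,
                          TrainingArguments, DataCollatorForLanguageModeling)

def fine_tune_gpt2_sketch(train_texts, output_dir="./fine-tuned-gpt2"):
    model = GPT2LMHeadModel.from_pretrained("gpt2")
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token                 # GPT-2 has no pad token by default

    encodings = tokenizer(train_texts, truncation=True, max_length=128,
                          padding="max_length")

    class TextDataset(torch.utils.data.Dataset):
        def __len__(self):
            return len(encodings["input_ids"])
        def __getitem__(self, i):
            return {k: torch.tensor(v[i]) for k, v in encodings.items()}

    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)  # causal LM labels
    args = TrainingArguments(output_dir=output_dir, num_train_epochs=3,
                             per_device_train_batch_size=4, logging_steps=50)
    trainer = Trainer(model=model, args=args,
                      train_dataset=TextDataset(), data_collator=collator)
    trainer.train()
    trainer.save_model(output_dir)
    return model, tokenizer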
Training Progress (TinyShakespeare, 10 epochs):
Epoch 1: Train Loss = 3.45 | Val Loss = 3.28
Epoch 5: Train Loss = 1.82 | Val Loss = 1.95
Epoch 10: Train Loss = 1.34 | Val Loss = 1.58
✓ Loss decreases smoothly
✓ Model learns language structure
Generated Text (after training):
Prompt: "To be or not to be"
Output: "To be or not to be, that is the question:
Whether 'tis nobler in the mind to suffer
The slings and arrows of outrageous fortune,
Or to take arms against a sea of troubles"
✓ Grammatically correct
✓ Shakespeare-style language
✓ Coherent continuation
Prompt: "Once upon a time"
Greedy (deterministic):
"Once upon a time there was a king who lived in a castle..."
✓ Safe, coherent
✗ Repetitive, boring
Temperature = 0.7:
"Once upon a time in a small village, a curious girl discovered a magical book..."
✓ Creative, diverse
✓ Still coherent
Top-k (k=50):
"Once upon a time, deep in the enchanted forest, an ancient tree whispered secrets..."
✓ Unexpected words
✓ Maintains quality
Nucleus (p=0.9):
"Once upon a time, when dragons still roamed the skies, a brave knight embarked..."
✓ Best balance of creativity and coherence
✓ Most human-like
Custom Corpus: Python Code
Before Fine-Tuning:
Prompt: "def fibonacci(n):"
Output: "I don't know what you mean by that..."
✗ Not code-aware
After Fine-Tuning (on Python code):
Prompt: "def fibonacci(n):"
Output: "def fibonacci(n):
if n <= 1:
return n
return fibonacci(n-1) + fibonacci(n-2)"
✓ Valid Python code!
✓ Correct fibonacci implementation
Your implementation is complete when:
Causal (autoregressive) mask:
# Lower triangular matrix (can only attend to past)
mask = torch.tril(torch.ones(seq_len, seq_len))
# [[1, 0, 0],
# [1, 1, 0],
# [1, 1, 1]]
# In attention scores:
scores = scores.masked_fill(mask == 0, -1e9) # -inf
Without masking: Model cheats by seeing future tokens!
1. Learning Rate Schedule: warm up the learning rate over the first steps, then decay it (e.g., linear decay or cosine annealing).
2. Gradient Clipping: clip gradient norms (a threshold of 1.0 is typical) to keep training stable.
3. Batch Size: use the largest batch that fits in GPU memory; accumulate gradients over several steps if it does not.
A minimal training-loop snippet applying these tips is sketched below.
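The snippet assumes the notebook's model, train_loader, and num_training_steps; the optimizer, scheduler choice, and thresholds are illustrative assumptions, not values mandated by the template:

import torch

optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=3e-4,
                                                total_steps=num_training_steps)  # warmup + decay
for xb, yb in train_loader:
    logits, loss = model(xb, targets=yb)          # GPT.forward returns (logits, loss)
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)   # gradient clipping
    optimizer.step()
    scheduler.step()                                          # step the schedule every batch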
| Use Case | Strategy | Parameters |
|---|---|---|
| Deterministic | Greedy | N/A |
| Slightly random | Temperature | T = 0.7-0.8 |
| Creative writing | Nucleus | p = 0.9 |
| Code generation | Top-k + temp | k = 50, T = 0.2 |
| Diverse ideas | Temperature | T = 1.2-1.5 |
Implement beam search decoding:
def beam_search(model, prompt, beam_width=5, max_len=50):
"""
Generate text using beam search
Maintains top-k hypotheses at each step
"""
pass
Benefit: Better quality than greedy, less random than sampling
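A simplified sketch is shown below (batch size 1, no length normalization; it assumes GPT.forward returns (logits, loss) as in this activity, and the name beam_search_sketch is illustrative):

import torch
import torch.nn.functional as F

@torch.no_grad()
def beam_search_sketch(model, prompt_ids, beam_width=5, max_len=50):
    """prompt_ids: (1, seq_len) tensor of token indices."""
    beams = [(prompt_ids, 0.0)]                      # (sequence, cumulative log-probability)
    for _ in range(max_len):
        candidates = []
        for seq, score in beams:
            logits, _ = model(seq[:, -model.max_len:])
            log_probs = F.log_softmax(logits[:, -1, :], dim=-1)          # (1, vocab)
            top_lp, top_idx = torch.topk(log_probs, beam_width, dim=-1)
            for k in range(beam_width):
                new_seq = torch.cat([seq, top_idx[:, k:k+1]], dim=1)
                candidates.append((new_seq, score + top_lp[0, k].item()))
        # Keep only the best beam_width hypotheses at each step
        beams = sorted(candidates, key=lambda c: c[1], reverse=True)[:beam_width]
    return beams[0][0]                               # highest-scoring sequence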
Implement bidirectional Transformer (encoder-only):
class BERT(nn.Module):
"""
Bidirectional encoder (no causal masking)
Use case: Classification, embeddings, masked language modeling
"""
pass
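A minimal starting point is to reuse the PositionalEncoding and TransformerBlock classes from the earlier TODOs and simply omit the causal mask (the class name EncoderOnly and the defaults are illustrative):

import torch.nn as nn

class EncoderOnly(nn.Module):
    """Illustrative encoder-only model built from this activity's components."""
    def __init__(self, vocab_size, d_model=512, num_heads=8, num_layers=6,
                 d_ff=2048, max_len=512, dropout=0.1):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model)
        self.pos = PositionalEncoding(d_model, max_len)
        self.blocks = nn.ModuleList([
            TransformerBlock(d_model, num_heads, d_ff, dropout)
            for _ in range(num_layers)
        ])
        self.norm = nn.LayerNorm(d_model)

    def forward(self, idx):
        x = self.pos(self.embed(idx))
        for block in self.blocks:
            x = block(x, mask=None)    # no causal mask -> every token attends to the whole sequence
        return self.norm(x)            # contextual embeddings, e.g. for a classification head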
Full seq2seq Transformer:
class EncoderDecoder(nn.Module):
"""
Encoder: Bidirectional attention
Decoder: Causal attention + cross-attention to encoder
"""
pass
Use case: Translation, summarization
Optimize attention for memory efficiency:
def flash_attention(Q, K, V):
"""
Compute attention without materializing full attention matrix
Memory: O(N) instead of O(N^2)
"""
pass
Completed Notebook: activity-14-transformer-architectures.ipynb
Generated Text Samples:
Fine-Tuned GPT-2:
Attention Visualization:
Analysis (7-10 sentences):
Next Activity: Activity 15 - Large Language Models (LLMs) Fundamentals
This activity is graded on:
Passing Grade: 70% or higher
Congratulations on mastering Transformer architectures! 🎉🤖