Trading Full Context for Speed
Peripheral vision sacrifices detail for breadth. You can't read text in your peripheral vision, but you can detect motion. Evolution traded acuity for coverage. For most survival situations, seeing something moving at the edge of vision matters more than reading it clearly.
Sliding window attention makes the same trade. It sacrifices the ability to directly connect distant tokens for the ability to process much longer sequences. Each token only attends to nearby tokens. The quadratic scaling becomes linear. For many tasks, this trade is worth it.
The Scaling Problem
def attention_scaling():
    return {
        "full_attention": {
            "operation": "Every token attends to every other token",
            "complexity": "O(n²) for sequence length n",
            "memory": "O(n²) to store attention matrix",
            "at_4k_tokens": "16M operations",
            "at_32k_tokens": "1B operations (64x more)",
            "at_128k_tokens": "16B operations (1024x more)",
        },
        "sliding_window": {
            "operation": "Each token attends to W nearby tokens",
            "complexity": "O(n × W) for window size W",
            "memory": "O(n × W)",
            "at_4k_tokens_w4k": "16M operations",
            "at_32k_tokens_w4k": "128M operations (8x more)",
            "at_128k_tokens_w4k": "512M operations (32x more)",
        },
        "comparison": """
            At 128K tokens:
            - Full attention: 16B operations
            - Sliding window (W=4K): 512M operations
            32x fewer operations for the same sequence length.
        """,
    }
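To sanity-check these counts yourself, the sketch below tallies only the query-key score computations and ignores constant factors such as head count; the 32x ratio at 128K falls out directly.

def attention_ops(seq_len, window=None):
    # Count query-key score computations only; ignores heads, projections, and constants.
    if window is None:
        return seq_len * seq_len               # full attention: O(n^2)
    return seq_len * min(window, seq_len)      # sliding window: O(n * W)

for n in (4_096, 32_768, 131_072):
    full, local = attention_ops(n), attention_ops(n, window=4_096)
    print(f"{n:>7} tokens: full={full / 1e9:6.2f}B  window={local / 1e9:5.2f}B  ratio={full // local}x")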
How Sliding Window Works
def sliding_window_mechanism():
    return {
        "basic_idea": """
            Token at position i only attends to tokens in range [i-W+1, i],
            where W is the window size (the token itself plus the W-1 tokens before it).
            Position 1000 attends to positions 997-1000 (if W=4)
            Position 50000 attends to positions 49997-50000
        """,
        "attention_pattern": """
            For sequence [A, B, C, D, E, F, G] with W=3:
            A attends to: [A]
            B attends to: [A, B]
            C attends to: [A, B, C]
            D attends to: [B, C, D]  # A now outside the window
            E attends to: [C, D, E]
            F attends to: [D, E, F]
            G attends to: [E, F, G]
        """,
"implementation": """
def sliding_window_attention(Q, K, V, window_size):
seq_len = Q.shape[1]
output = torch.zeros_like(Q)
for i in range(seq_len):
# Define window
start = max(0, i - window_size + 1)
end = i + 1
# Attend only within window
q_i = Q[:, i:i+1]
k_window = K[:, start:end]
v_window = V[:, start:end]
scores = torch.matmul(q_i, k_window.transpose(-2, -1))
attn = torch.softmax(scores / sqrt(d), dim=-1)
output[:, i] = torch.matmul(attn, v_window)
return output
""",
}
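For completeness, here is a vectorized sketch of the same pattern using a banded causal mask. It still materializes the full n×n score matrix, so it demonstrates which connections are allowed rather than the memory savings; production kernels apply the same mask without ever building the matrix.

import math
import torch

def sliding_window_attention_masked(Q, K, V, window_size):
    # Q, K, V: (..., seq_len, d). Each query i may attend to keys in [i - window_size + 1, i].
    seq_len, d = Q.shape[-2], Q.shape[-1]
    pos = torch.arange(seq_len)
    offset = pos[:, None] - pos[None, :]             # query index minus key index
    allowed = (offset >= 0) & (offset < window_size)

    scores = Q @ K.transpose(-2, -1) / math.sqrt(d)
    scores = scores.masked_fill(~allowed, float("-inf"))
    return torch.softmax(scores, dim=-1) @ V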
What You Lose
def sliding_window_limitations():
    return {
        "direct_long_range": {
            "loss": "Token at position 50000 can't directly attend to position 0",
            "impact": "Long-range dependencies must propagate through layers",
            "mitigation": "With L layers and window W, info travels L*W positions",
        },
        "global_context": {
            "loss": "No single position 'sees' the whole sequence",
            "impact": "Tasks requiring full-sequence understanding affected",
            "mitigation": "Add global tokens or use hybrid approach",
        },
        "tasks_affected": {
            "summarization": "May miss important details from document start",
            "long_range_qa": "Answer at start, question at end = problem",
            "complex_reasoning": "Multi-step reasoning across distant facts",
        },
        "tasks_unaffected": {
            "local_generation": "Next token usually depends on recent context",
            "streaming": "Only recent context matters",
            "code_completion": "Usually local context sufficient",
        },
    }
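A quick back-of-the-envelope helper for the L*W mitigation above: how many layers it takes, at best, for information to hop a given distance through stacked windows. Treat this as an upper bound on usefulness, since indirect propagation is much lossier than direct attention.

def layers_to_reach(distance, window_size):
    # Each layer lets information move back at most (window_size - 1) positions,
    # so the receptive field grows by roughly one window per layer.
    hop = window_size - 1
    return -(-distance // hop)  # ceiling division

# With W=4096, a fact 50,000 tokens back needs ~13 layers before it can influence the current token.
print(layers_to_reach(50_000, 4_096))   # 13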
Mistral's Approach
def mistral_sliding_window():
    """
    How Mistral 7B implements sliding window attention
    """
    return {
        "configuration": {
            "window_size": 4096,  # 4K token window
            "num_layers": 32,
            "effective_reach": "4096 * 32 = 131K positions (in theory)",
        },
        "rolling_buffer_cache": {
            "idea": "KV cache only stores window_size entries per layer",
            "benefit": "Fixed memory regardless of sequence length",
            "implementation": """
                # Cache holds window_size positions per layer, not seq_len:
                cache_size = 2 * num_layers * window_size * hidden_dim * batch_size
                # Instead of growing with sequence length,
                # write new entries into a circular buffer:
                position = current_position % window_size
                kv_cache[position] = current_kv
            """,
        },
        "practical_results": {
            "memory": "Fixed KV cache size (huge win for long sequences)",
            "quality": "Competitive with full attention for most tasks",
            "speed": "Linear scaling enables very long sequences",
        },
    }
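A minimal sketch of a rolling-buffer cache along these lines; the shapes and class name are illustrative assumptions, not Mistral's actual code.

import torch

class RollingKVCache:
    """Per-layer KV cache with a fixed number of slots; positions wrap modulo window_size."""

    def __init__(self, window_size, num_kv_heads, head_dim, dtype=torch.float16):
        self.window_size = window_size
        self.k = torch.zeros(window_size, num_kv_heads, head_dim, dtype=dtype)
        self.v = torch.zeros(window_size, num_kv_heads, head_dim, dtype=dtype)
        self.next_pos = 0  # absolute position of the next token to be written

    def append(self, k_new, v_new):
        slot = self.next_pos % self.window_size  # overwrite the oldest entry
        self.k[slot] = k_new
        self.v[slot] = v_new
        self.next_pos += 1

    def window(self):
        # Return the cached keys/values in chronological order for attention.
        n = min(self.next_pos, self.window_size)
        idx = torch.arange(self.next_pos - n, self.next_pos) % self.window_size
        return self.k[idx], self.v[idx]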
Hybrid Approaches
def hybrid_attention():
    return {
        "global_local": {
            "idea": "Most tokens use window, some global tokens see everything",
            "example": "Longformer",
            "implementation": """
                Global tokens: [CLS], task tokens
                Local tokens: everything else
                Global tokens: full attention to all positions
                Local tokens: window attention + attend to globals
            """,
            "benefit": "Global context preserved with linear complexity",
        },
        "dilated_window": {
            "idea": "Window with gaps for longer reach",
            "example": "Attend to positions [i-1, i-2, i-4, i-8, ...]",
            "benefit": "Reach grows exponentially with the number of attended positions",
        },
        "alternating_layers": {
            "idea": "Some layers use full attention, most use window",
            "example": "Every 4th layer is full attention",
            "implementation": """
                for i, layer in enumerate(layers):
                    if i % 4 == 0:
                        x = full_attention(x)     # global view
                    else:
                        x = sliding_attention(x)  # local, efficient
            """,
            "benefit": "Occasional global view, mostly efficient",
        },
        "sparse_patterns": {
            "idea": "Fixed sparse patterns instead of a contiguous window",
            "example": "BigBird, Sparse Transformer",
            "patterns": ["Local window", "Global tokens", "Random connections"],
        },
    }
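As an illustration of the global-plus-local idea, the sketch below builds the boolean connection mask: a causal local band (matching the window pattern above) plus a few global positions that attend to, and are attended by, everything. Longformer itself uses a symmetric window in an encoder and custom kernels, so treat this as the pattern, not the implementation.

import torch

def hybrid_attention_mask(seq_len, window_size, global_positions):
    # Returns a boolean (seq_len, seq_len) mask where True means attention is allowed.
    pos = torch.arange(seq_len)
    offset = pos[:, None] - pos[None, :]
    mask = (offset >= 0) & (offset < window_size)   # causal local band
    g = torch.tensor(global_positions)
    mask[g, :] = True   # global tokens see every position
    mask[:, g] = True   # every position sees the global tokens
    return mask

# Example: 16 tokens, window of 4, with token 0 acting as a [CLS]-style global token.
print(hybrid_attention_mask(16, 4, [0]).int())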
Memory Benefits
def memory_comparison():
    """
    The real advantage: a fixed memory footprint
    """
    return {
        "full_attention_kv_cache": {
            "formula": "2 * num_layers * seq_len * hidden_dim * dtype_size",
            "at_32k_seq": "~16 GB for a 70B model",
            "at_128k_seq": "~64 GB for a 70B model",
            "problem": "Grows linearly with sequence length",
        },
        "sliding_window_kv_cache": {
            "formula": "2 * num_layers * window_size * hidden_dim * dtype_size",
            "at_32k_seq_4k_window": "~2 GB for a 70B model",
            "at_128k_seq_4k_window": "~2 GB for a 70B model (same!)",
            "benefit": "Fixed regardless of sequence length",
        },
        "implication": """
            With sliding window:
            - Can serve 8x more concurrent requests at 32K
            - Can serve 32x more concurrent requests at 128K
            - Memory planning becomes predictable
            This is how models like Mistral 7B serve long contexts efficiently.
        """,
    }
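The formulas above are easy to plug numbers into. The sketch below assumes a 70B-class shape with grouped-query attention (80 layers, 8 KV heads of dimension 128, fp16); the exact gigabyte figures shift with the KV head layout, but the 8x and 32x ratios do not.

def kv_cache_bytes(cached_positions, num_layers=80, kv_dim=8 * 128, dtype_bytes=2):
    # 2x for keys and values; kv_dim = num_kv_heads * head_dim
    # (equal to hidden_dim for plain multi-head attention, much smaller with GQA).
    return 2 * num_layers * cached_positions * kv_dim * dtype_bytes

full_32k, full_128k = kv_cache_bytes(32_768), kv_cache_bytes(131_072)
window = kv_cache_bytes(4_096)   # fixed, whatever the sequence length

print(f"full @ 32K : {full_32k / 1e9:5.1f} GB")
print(f"full @ 128K: {full_128k / 1e9:5.1f} GB")
print(f"window 4K  : {window / 1e9:5.1f} GB  ({full_32k // window}x / {full_128k // window}x smaller)")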
When to Use Sliding Window
def sliding_window_decision():
    return {
        "use_sliding_window": [
            "Very long sequences (> 32K tokens)",
            "Memory-constrained environments",
            "High-throughput batch processing",
            "Tasks with primarily local dependencies",
            "Real-time streaming applications",
        ],
        "avoid_sliding_window": [
            "Tasks requiring full-document understanding",
            "Complex reasoning across distant facts",
            "When quality is more important than efficiency",
            "Short sequences (overhead not worth it)",
        ],
        "hybrid_when": [
            "Need long context AND global understanding",
            "Can identify which tokens need global view",
            "Willing to trade some efficiency for quality",
        ],
        "evaluation_approach": """
            1. Test full attention on your task (quality baseline)
            2. Test sliding window at various sizes
            3. Find the smallest window that maintains quality
            4. Measure the memory/latency improvement
            5. Decide if the quality-efficiency trade is acceptable
        """,
    }
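A sketch of that evaluation loop; `run_eval` and `measure_memory` are hypothetical stand-ins for your own task harness, and the 1% quality tolerance is an arbitrary placeholder.

def sweep_window_sizes(run_eval, measure_memory, windows=(None, 16_384, 8_192, 4_096, 2_048)):
    # Step 1: window=None means full attention and sets the quality baseline.
    results = []
    for w in windows:
        results.append({
            "window": w,
            "quality": run_eval(window_size=w),           # steps 1-2: task quality at each setting
            "memory_gb": measure_memory(window_size=w),   # step 4: resource cost
        })
    baseline = results[0]["quality"]
    # Step 3: pick the smallest window that stays within 1% of the full-attention baseline.
    acceptable = [r for r in results if r["quality"] >= 0.99 * baseline]
    return min(acceptable, key=lambda r: r["window"] if r["window"] is not None else float("inf"))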
Practical Configuration
def configuration_guidance():
    return {
        "window_size_selection": {
            "rule_of_thumb": "Larger than typical relevant context",
            "code": "2-4K (functions, classes)",
            "conversation": "4-8K (recent turns)",
            "documents": "8-16K (sections, paragraphs)",
        },
        "quality_monitoring": {
            "metrics": [
                "Task-specific accuracy",
                "Long-range dependency tests",
                "User satisfaction (for interactive use)",
            ],
            "red_flag": "Quality drops on tasks requiring distant context",
        },
        "deployment_pattern": """
            # Configure based on use case
            if use_case == "code_completion":
                window_size = 4096
                expect_quality = "equivalent to full attention"
            elif use_case == "long_document_qa":
                window_size = 8192
                use_hybrid = True  # add global tokens
            elif use_case == "streaming_chat":
                window_size = 4096
                expect_quality = "good (recent context dominates)"
        """,
    }
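If the deployment pattern above turns into real configuration, a small preset table keeps the choices in one place; the names and values below are illustrative, not from any particular serving stack.

from dataclasses import dataclass

@dataclass
class AttentionConfig:
    window_size: int
    use_hybrid: bool = False   # add global tokens on top of the local window
    note: str = ""

PRESETS = {
    "code_completion": AttentionConfig(4_096, note="expect quality equivalent to full attention"),
    "long_document_qa": AttentionConfig(8_192, use_hybrid=True),
    "streaming_chat": AttentionConfig(4_096, note="recent context dominates"),
}

def configure(use_case):
    if use_case not in PRESETS:
        raise ValueError(f"No sliding-window preset for {use_case!r}; start from the table above")
    return PRESETS[use_case]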
Sliding window attention trades global visibility for efficiency. For tasks where nearby context dominates, this trade costs nothing. For tasks requiring full-sequence understanding, it costs quality. Know which case you're in before choosing.