Optimize a GPT language model training script to achieve the lowest possible validation bits per byte (val_bpb). You have a training service that runs your modified code on Shakespeare's Complete Works (~5MB). The baseline achieves val_bpb ≈ 2.80. You have 50 runs and 3 hours. Lower val_bpb = better score.
{
"train_code": "import json\nimport math\nimport os\nimport time\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nfrom prepare import MAX_SEQ_LEN, TIME_BUDGET, VOCAB_SIZE, evaluate_bpb, make_dataloader\n\n\nd_model = 128\nn_heads = 4\nn_layers = 3\nd_ff = 512\n\nbatch_size = 12\neval_batch_size = 64\nlearning_rate = 0.0016\nmin_lr_ratio = 0.15\nwarmup_ratio = 0.06\nweight_decay = 0.1\ngrad_clip = 1.0\ntrain_deadline = TIME_BUDGET - 6.0\ndevice = \"cpu\"\n\nhead_dim = d_model // n_heads\nseed = int(os.environ.get(\"TORCH_SEED\", \"42\"))\ntorch.manual_seed(seed)\n\n\nclass Attention(nn.Module):\n def __init__(self):\n super().__init__()\n self.qkv = nn.Linear(d_model, 3 * d_model, bias=False)\n self.out_proj = nn.Linear(d_model, d_model, bias=False)\n\n def forward(self, x):\n bsz, seq_len, channels = x.shape\n q, k, v = self.qkv(x).chunk(3, dim=-1)\n q = q.view(bsz, seq_len, n_heads, head_dim).transpose(1, 2)\n k = k.view(bsz, seq_len, n_heads, head_dim).transpose(1, 2)\n v = v.view(bsz, seq_len, n_heads, head_dim).transpose(1, 2)\n y = F.scaled_dot_product_attention(q, k, v, dropout_p=0.0, is_causal=True)\n y = y.transpose(1, 2).contiguous().view(bsz, seq_len, channels)\n return self.out_proj(y)\n\n\nclass MLP(nn.Module):\n def __init__(self):\n super().__init__()\n self.fc1 = nn.Linear(d_model, d_ff)\n self.fc2 = nn.Linear(d_ff, d_model)\n\n def forward(self, x):\n return self.fc2(F.gelu(self.fc1(x), approximate=\"tanh\"))\n\n\nclass Block(nn.Module):\n def __init__(self):\n super().__init__()\n self.ln1 = nn.LayerNorm(d_model)\n self.ln2 = nn.LayerNorm(d_model)\n self.attn = Attention()\n self.mlp = MLP()\n\n def forward(self, x):\n x = x + self.attn(self.ln1(x))\n x = x + self.mlp(self.ln2(x))\n return x\n\n\nclass GPT(nn.Module):\n def __init__(self):\n super().__init__()\n self.tok_emb = nn.Embedding(VOCAB_SIZE, d_model)\n self.pos_emb = nn.Embedding(MAX_SEQ_LEN, d_model)\n self.blocks = nn.ModuleList([Block() for _ in range(n_layers)])\n self.ln_f = nn.LayerNorm(d_model)\n self.head = nn.Linear(d_model, VOCAB_SIZE, bias=False)\n self.head.weight = self.tok_emb.weight\n self.apply(self._init_weights)\n\n @staticmethod\n def _init_weights(module):\n if isinstance(module, nn.Linear):\n nn.init.normal_(module.weight, mean=0.0, std=0.02)\n if module.bias is not None:\n nn.init.zeros_(module.bias)\n elif isinstance(module, nn.Embedding):\n nn.init.normal_(module.weight, mean=0.0, std=0.02)\n\n def forward(self, x, targets=None):\n _, seq_len = x.shape\n positions = torch.arange(seq_len, device=x.device)\n h = self.tok_emb(x) + self.pos_emb(positions)\n for block in self.blocks:\n h = block(h)\n logits = self.head(self.ln_f(h))\n if targets is None:\n return logits\n return F.cross_entropy(logits.reshape(-1, VOCAB_SIZE), targets.reshape(-1))\n\n\ndef build_optimizer(model: nn.Module) -> torch.optim.Optimizer:\n decay_params = []\n no_decay_params = []\n for param in model.parameters():\n if not param.requires_grad:\n continue\n if param.dim() >= 2:\n decay_params.append(param)\n else:\n no_decay_params.append(param)\n return torch.optim.AdamW(\n [\n {\"params\": decay_params, \"weight_decay\": weight_decay},\n {\"params\": no_decay_params, \"weight_decay\": 0.0},\n ],\n lr=learning_rate,\n betas=(0.9, 0.95),\n )\n\n\ndef lr_for(elapsed: float) -> float:\n progress = min(1.0, elapsed / train_deadline)\n if progress < warmup_ratio:\n return learning_rate * (progress / max(warmup_ratio, 1e-8))\n cosine_progress = (progress - warmup_ratio) / max(1.0 - warmup_ratio, 
1e-8)\n cosine = 0.5 * (1.0 + math.cos(math.pi * cosine_progress))\n return learning_rate * (min_lr_ratio + (1.0 - min_lr_ratio) * cosine)\n\n\ndef set_lr(optimizer: torch.optim.Optimizer, lr: float) -> None:\n for group in optimizer.param_groups:\n group[\"lr\"] = lr\n\n\ndef train():\n model = GPT().to(device)\n optimizer = build_optimizer(model)\n train_loader = make_dataloader(batch_size, MAX_SEQ_LEN, \"train\", device)\n\n start = time.time()\n avg_step_time = 1.5\n step = 0\n last_loss = None\n\n while True:\n elapsed = time.time() - start\n if elapsed + avg_step_time >= train_deadline:\n break\n\n set_lr(optimizer, lr_for(elapsed))\n step_start = time.time()\n\n x, y = next(train_loader)\n loss = model(x, y)\n optimizer.zero_grad(set_to_none=True)\n loss.backward()\n torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)\n optimizer.step()\n\n step_time = time.time() - step_start\n avg_step_time = 0.9 * avg_step_time + 0.1 * step_time\n step += 1\n last_loss = loss.item()\n\n training_time = time.time() - start\n eval_start = time.time()\n val_bpb = evaluate_bpb(model, eval_batch_size, device)\n eval_time = time.time() - eval_start\n\n print(\n json.dumps(\n {\n \"val_bpb\": round(val_bpb, 6),\n \"train_loss\": round(last_loss, 6) if last_loss is not None else None,\n \"total_steps\": step,\n \"training_time_secs\": round(training_time, 2),\n \"eval_time_secs\": round(eval_time, 2),\n \"num_params_M\": round(sum(p.numel() for p in model.parameters()) / 1e6, 4),\n \"d_model\": d_model,\n \"n_layers\": n_layers,\n \"n_heads\": n_heads,\n \"d_ff\": d_ff,\n \"batch_size\": batch_size,\n \"eval_batch_size\": eval_batch_size,\n \"learning_rate\": learning_rate,\n }\n )\n )\n\n\nif __name__ == \"__main__\":\n train()\n",
"methodology": "## Experiment Log\n\n- run-0 `baseline`: val_bpb=4.001349, steps=61, train_time=204.98s\n- run-1 `throughput_first`: val_bpb=4.901305, steps=158, train_time=192.85s\n- run-2 `conservative_transformer`: val_bpb=3.557283, steps=126, train_time=180.46s\n- run-3 `deeper_narrower_transformer`: val_bpb=3.583560, steps=119, train_time=181.68s\n- run-4 `smaller_batch_transformer`: val_bpb=3.526792, steps=160, train_time=179.30s\n- run-5 `longer_training_best_arch`: val_bpb=3.518979, steps=163, train_time=187.32s\n- run-6 `lower_decay_best_arch`: val_bpb=3.520554, steps=173, train_time=186.61s\n\n## Key Findings\n\n- The untouched baseline badly overshot the 180s training budget and only reached 61 steps, landing at 4.001349 val_bpb.\n- A throughput-only short-context attempt trained much faster but generalized poorly at evaluation time, so pure speed was not enough.\n- The best family kept full 256-token training, switched to a 3-layer pre-LN transformer with GELU, tied input/output embeddings, AdamW, grad clipping, and a cosine schedule.\n- Decoupling training and evaluation batch size mattered. Training with batch size 12 improved update count, while evaluating with batch size 64 kept validation overhead low.\n- Letting the best configuration train a bit longer improved 3.526792 to 3.518979, while lowering weight decay after that did not help.\n\n## Final Configuration\n\n- Architecture: 3-layer transformer, d_model=128, n_heads=4, d_ff=512, pre-LayerNorm, GELU, tied embeddings\n- Optimization: AdamW, lr=1.6e-3, betas=(0.9, 0.95), weight_decay=0.1, grad clip=1.0\n- Batching: train batch size 12, eval batch size 64\n- Timing: time-based warmup+cosine schedule and explicit step-time-aware stopping before evaluation\n\n## Best Result\n\n- Best run: `run-5`\n- val_bpb: 3.518979\n- total_steps: 163\n- training_time_secs: 187.32"
}
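Note on the metric: `evaluate_bpb` comes from the task's `prepare` module and is not reproduced above. Assuming a byte- or character-level vocabulary where one token corresponds to one byte, bits per byte is simply the mean cross-entropy in nats divided by ln 2. The helper below is an illustrative sketch of that conversion, not the actual `prepare` implementation.

```python
import math

import torch
import torch.nn.functional as F


def bits_per_byte(logits: torch.Tensor, targets: torch.Tensor) -> float:
    """Convert mean cross-entropy (nats per token) to bits per byte.

    Assumes one token == one byte; with a different tokenizer the value
    would need rescaling by the average number of tokens per byte.
    """
    nats_per_token = F.cross_entropy(
        logits.reshape(-1, logits.size(-1)), targets.reshape(-1)
    ).item()
    return nats_per_token / math.log(2)
```

Under that assumption, the run-5 result of 3.518979 bpb corresponds to roughly 2.44 nats of cross-entropy per character.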
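The data pipeline is likewise defined in `prepare`; `train_code` only reveals the call `make_dataloader(batch_size, MAX_SEQ_LEN, "train", device)` returning an iterator of `(x, y)` batches. A minimal sketch of what such a loader over a byte-level corpus could look like follows; the file name, dtype, and sampling scheme are assumptions for illustration, not the real module.

```python
import numpy as np
import torch


def make_dataloader(batch_size: int, seq_len: int, split: str, device: str):
    """Illustrative stand-in for prepare.make_dataloader.

    Yields (x, y) batches of token ids drawn from random offsets of a
    byte-level corpus; the real module may differ in file format,
    splitting, and sampling.
    """
    data = np.fromfile(f"{split}.bin", dtype=np.uint8)  # assumed on-disk format
    data = torch.from_numpy(data.astype(np.int64))
    while True:  # infinite iterator, consumed via next() in the training loop
        starts = torch.randint(0, len(data) - seq_len - 1, (batch_size,)).tolist()
        x = torch.stack([data[s : s + seq_len] for s in starts])
        y = torch.stack([data[s + 1 : s + seq_len + 1] for s in starts])
        yield x.to(device), y.to(device)
```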