From af990122d1a8a03c0ab756329a99dfd6a08f2f5c Mon Sep 17 00:00:00 2001 From: Sentinel Research Date: Wed, 24 Jun 2026 07:05:37 +0200 Subject: [PATCH] Hito 1: scaffold del paper (estructura, LaTeX revtex4-2, CI, licencias duales) --- .github/workflows/noise-harness.yml | 31 ++++++++++++++ .gitignore | 29 ++++++++++++++ AUTHORS.md | 8 ++++ README.md | 40 ++++++++++++++++++- data/README.md | 30 ++++++++++++++ experiments/README.md | 27 +++++++++++++ paper/main.tex | 39 ++++++++++++++++++ paper/references.bib | 7 ++++ paper/sections/01_introduction.tex | 15 +++++++ paper/sections/02_related_work.tex | 17 ++++++++ paper/sections/03_problem_formalization.tex | 18 +++++++++ paper/sections/04_the_lookahead_bug.tex | 17 ++++++++ .../sections/05_noise_harness_methodology.tex | 20 ++++++++++ paper/sections/06_experimental_setup.tex | 23 +++++++++++ paper/sections/07_results.tex | 19 +++++++++ .../sections/08_discussion_and_conclusion.tex | 22 ++++++++++ results/.gitkeep | 0 17 files changed, 361 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/noise-harness.yml create mode 100644 .gitignore create mode 100644 AUTHORS.md create mode 100644 data/README.md create mode 100644 experiments/README.md create mode 100644 paper/main.tex create mode 100644 paper/references.bib create mode 100644 paper/sections/01_introduction.tex create mode 100644 paper/sections/02_related_work.tex create mode 100644 paper/sections/03_problem_formalization.tex create mode 100644 paper/sections/04_the_lookahead_bug.tex create mode 100644 paper/sections/05_noise_harness_methodology.tex create mode 100644 paper/sections/06_experimental_setup.tex create mode 100644 paper/sections/07_results.tex create mode 100644 paper/sections/08_discussion_and_conclusion.tex create mode 100644 results/.gitkeep diff --git a/.github/workflows/noise-harness.yml b/.github/workflows/noise-harness.yml new file mode 100644 index 0000000..244f61c --- /dev/null +++ b/.github/workflows/noise-harness.yml @@ -0,0 
+1,31 @@ +name: noise-harness + +on: + pull_request: + push: + branches: [main] + +jobs: + run-noise-harness: + runs-on: docker + container: + image: node:20-bookworm + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + run: | + apt-get update -qq && apt-get install -y -qq python3 python3-pip python3-venv + python3 -m venv .venv + . .venv/bin/activate + pip install -q -r experiments/requirements.txt + if: hashFiles('experiments/requirements.txt') != '' + + - name: Run noise harness + run: | + if [ -f experiments/05_noise_harness.py ]; then + . .venv/bin/activate + python3 experiments/05_noise_harness.py + else + echo "noise harness script not present yet — placeholder pass" + fi diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d14710e --- /dev/null +++ b/.gitignore @@ -0,0 +1,29 @@ +# Python +__pycache__/ +*.pyc +.venv/ +venv/ +*.egg-info/ + +# LaTeX +*.aux +*.bbl +*.bbl-SAVE-ERROR +*.blg +*.fdb_latexmk +*.fls +*.log +*.out +*.synctex.gz +*.toc +*.run.xml +paper/main.pdf + +# Large data — never commit raw market data +data/*.parquet +data/*.csv +data/raw/ + +# Local experiment scratch +results/raw/local-* +.DS_Store diff --git a/AUTHORS.md b/AUTHORS.md new file mode 100644 index 0000000..db30c4e --- /dev/null +++ b/AUTHORS.md @@ -0,0 +1,8 @@ +# Authors + +**Sentinel Research** — institutional author. + +Corresponding author: Daniel Cruces (danielcruces71@gmail.com) + +For questions about reproduction, data, or the audit trail, open an issue +on this repository or contact the corresponding author directly. diff --git a/README.md b/README.md index 55db8d3..5f8e723 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,40 @@ -# lookahead-bias-paper +# Lookahead Bias in Vectorized Backtesting: A Noise Harness Diagnostic +Repository for the paper documenting a lookahead-bias defect found in a +vectorized backtester (the "K12" kernel), and a noise-harness methodology +to detect this class of bug using pure Geometric Brownian Motion data. 
+ +## Structure + +``` +paper/ LaTeX source (revtex4-2) +experiments/ Description and specs of the 5 planned experiments +data/ Instructions to obtain the BTCUSDT 1m dataset (no large binaries in git) +results/ Raw experiment outputs (json/csv), git-tracked once produced +audit/input/ Forensic copy of the original K12 code/data/results for reproduction +``` + +## How to reproduce + +1. Read `experiments/README.md` for the experiment list and what each one tests. +2. Read `data/README.md` to obtain the dataset (or regenerate synthetic GBM data). +3. Run the scripts under `experiments/` (CI runs the noise harness automatically + on every PR, see `.github/workflows/noise-harness.yml`). +4. Compare your output against the reference files in `results/`. + +## License — read this before reusing anything + +This repository carries **two separate licenses** for two separate kinds of content: + +| Content | License | File | +|---|---|---| +| Code: experiment scripts, harness, CI workflows, anything under `experiments/`, `data/`, `.github/` | **MIT** | [`LICENSE`](LICENSE) | +| Paper text and figures, anything under `paper/` | **CC-BY 4.0** | [`LICENSE-TEXT-CC-BY-4.0.md`](LICENSE-TEXT-CC-BY-4.0.md) | + +Use the code freely, commercially or not, with attribution (MIT terms). +Reuse the paper text/figures freely, commercially or not, with attribution (CC-BY 4.0 terms). +These are independent grants — reusing the code does not require complying with CC-BY, and vice versa. + +## Authorship + +See [`AUTHORS.md`](AUTHORS.md). diff --git a/data/README.md b/data/README.md new file mode 100644 index 0000000..fdfe869 --- /dev/null +++ b/data/README.md @@ -0,0 +1,30 @@ +# Data + +This repository does not track raw market data (see `.gitignore`). Large +binaries don't belong in a public git repo, and Binance's public API makes +the data trivially reconstructible. + +## BTCUSDT 1-minute OHLCV + +Used in experiments 2 and 3 (baseline and honest replication against real +data). 
To obtain it: + +1. If `download_data.py` exists in this directory, run it — it pulls the + exact date range used in the original experiment from the public Binance + API and writes `BTCUSDT_1m.parquet`. +2. Verify the SHA-256 hash of the resulting file matches the one recorded in + `audit/input/MANIFEST.md` (forensic record of the original dataset used + when the bug was found). + +```bash +sha256sum BTCUSDT_1m.parquet +``` + +If the hash doesn't match, the date range or Binance API response has +drifted — do not proceed with replication until it's reconciled. + +## Synthetic GBM data (noise harness) + +Generated on the fly by `experiments/01_generate_gbm.py`. No download +needed — this is the point of using synthetic null data: it requires zero +external dependencies and is perfectly reproducible from a seed. diff --git a/experiments/README.md b/experiments/README.md new file mode 100644 index 0000000..8d5e9b0 --- /dev/null +++ b/experiments/README.md @@ -0,0 +1,27 @@ +# Experiments + +Five experiments, run in order. Each script is `0N_<name>.py`. Scripts that +don't exist yet are listed here as a spec so the CI workflow and the paper's +Section 6 stay in sync with what's actually implemented. 
+ +| # | Script | Purpose | Status | +|---|--------|---------|--------| +| 1 | `01_generate_gbm.py` | Generate pure-noise GBM price series (fixed seed, documented params) | pending | +| 2 | `02_baseline_replication.py` | Run K12 golden hyperparameters on real BTCUSDT 1m, buggy backtester → expect Sharpe ≈ 14.49 | pending — needs `audit/input/code` | +| 3 | `03_honest_replication.py` | Same hyperparameters/data, `time_machine.py` engine → expect Sharpe ≈ -0.25 | pending — needs `audit/input/code` | +| 4 | `04_noise_control.py` | Run both engines across ≥30 independent GBM seeds, compare Sharpe distributions | pending | +| 5 | `05_noise_harness.py` | CI-gating version of experiment 4: fails the build if mean Sharpe on noise falls outside a pre-registered null band | pending | + +## Reproducibility rules + +- Every script must take `--seed` and print it in its output. +- Every output JSON must include: seed, kernel version/hash, library versions + (numpy/pandas), and a UTC timestamp. +- No script reads from `audit/input/` directly in a way that would couple the + public reproduction path to the forensic copy — `audit/input/` is for our + own verification, not for the published reproduction instructions. + +## Environment + +Pin dependencies in `requirements.txt` (to be added alongside the first +script). CI installs from that file — see `.github/workflows/noise-harness.yml`. diff --git a/paper/main.tex b/paper/main.tex new file mode 100644 index 0000000..292f9d4 --- /dev/null +++ b/paper/main.tex @@ -0,0 +1,39 @@ +\documentclass[aps,onecolumn,nofootinbib,floatfix]{revtex4-2} + +\usepackage{graphicx} +\usepackage{amsmath} +\usepackage{amssymb} +\usepackage{hyperref} +\usepackage{booktabs} + +\begin{document} + +\title{Lookahead Bias in Vectorized Backtesting: A Noise Harness Diagnostic} + +\author{Sentinel Research} +\affiliation{Sentinel Research} +\email{danielcruces71@gmail.com} + +\date{\today} + +\begin{abstract} +% TODO: 150-250 words. 
Must state: (1) the defect found (lookahead bias in a +% vectorized backtester), (2) the diagnostic method (pure-noise GBM harness), +% (3) the headline numbers (Sharpe 14.49 under the bug vs Sharpe -0.25 fixed), +% (4) why this matters for anyone running vectorized backtests at scale. +\end{abstract} + +\maketitle + +\input{sections/01_introduction} +\input{sections/02_related_work} +\input{sections/03_problem_formalization} +\input{sections/04_the_lookahead_bug} +\input{sections/05_noise_harness_methodology} +\input{sections/06_experimental_setup} +\input{sections/07_results} +\input{sections/08_discussion_and_conclusion} + +\bibliography{references} + +\end{document} diff --git a/paper/references.bib b/paper/references.bib new file mode 100644 index 0000000..63e43c4 --- /dev/null +++ b/paper/references.bib @@ -0,0 +1,7 @@ +% Bibliography for "Lookahead Bias in Vectorized Backtesting" +% Populate in Hito 2. Suggested starting points to look up and add: +% - Bailey, D.H. & Lopez de Prado, M., "The Deflated Sharpe Ratio" +% - Bailey, D.H. et al., "Pseudo-Mathematics and Financial Charlatanism" +% - Bailey, D.H. & Lopez de Prado, M., "The Probability of Backtest Overfitting" +% - Harvey, C.R., Liu, Y., Zhu, H., "...and the Cross-Section of Expected Returns" +% - White, H., "A Reality Check for Data Snooping" diff --git a/paper/sections/01_introduction.tex b/paper/sections/01_introduction.tex new file mode 100644 index 0000000..6f42521 --- /dev/null +++ b/paper/sections/01_introduction.tex @@ -0,0 +1,15 @@ +\section{Introduction} +\label{sec:introduction} + +% TODO content notes: +% - Motivate why backtest correctness matters: a single off-by-one index in a +% vectorized backtester can silently fabricate alpha. 
+% - State the concrete finding up front: a 15-hyperparameter "golden kernel" +% (K12 / Iter12) produced via genetic-algorithm search showed Sharpe 14.49 +% on BTCUSDT 1m data — and the same kernel, run honestly (bar-by-bar, no +% future information), collapses to Sharpe -0.25. +% - Frame the contribution: not just "we found a bug," but a reusable +% noise-harness methodology (Section 5) that any quant team can run against +% their own backtester to detect this class of defect using pure +% Geometric Brownian Motion data with zero real signal. +% - End with a roadmap of the paper (one sentence per remaining section). diff --git a/paper/sections/02_related_work.tex b/paper/sections/02_related_work.tex new file mode 100644 index 0000000..b0cd8ee --- /dev/null +++ b/paper/sections/02_related_work.tex @@ -0,0 +1,17 @@ +\section{Related Work} +\label{sec:related-work} + +% TODO content notes: +% - Lookahead bias / data leakage in backtesting: cite the standard +% references (e.g. Bailey & Lopez de Prado on backtest overfitting, +% "pseudo-mathematics" critiques, Probability of Backtest Overfitting). +% - Vectorized vs event-driven backtesting engines: tradeoffs in speed vs +% correctness; vectorized engines are more prone to index-alignment bugs +% because there is no explicit "current bar" boundary enforced by the loop. +% - Synthetic-data / null-model testing in finance: permutation tests, +% Monte Carlo null models, white-noise sanity checks as a general technique +% to detect overfit or leaky strategies before risking capital. +% - Position this paper: distinct from prior work in that it (a) documents a +% live, reproducible incident with full forensic trail, and (b) packages +% the diagnostic as a minimal, CI-runnable harness (Section 5) rather than +% a one-off statistical test. 
diff --git a/paper/sections/03_problem_formalization.tex b/paper/sections/03_problem_formalization.tex new file mode 100644 index 0000000..17db8ab --- /dev/null +++ b/paper/sections/03_problem_formalization.tex @@ -0,0 +1,18 @@ +\section{Problem Formalization} +\label{sec:formalization} + +% TODO content notes: +% - Define the backtest setting formally: price series P_t, signal S_t, +% position p_t, and the honesty constraint p_t = f(P_{<=t}, S_{<=t}) only +% (no access to P_{>t}). +% - Define lookahead bias precisely: any computation where p_t depends, +% directly or via a vectorized operation (e.g. shift(-1), rolling window +% misaligned by one bar, future-looking groupby), on P_{>t}. +% - Show the general shape of the bug class in vectorized code: a single +% missing .shift(1) or an inclusive/exclusive boundary error in a rolling +% window. Use abstract pseudocode here; the concrete K12 diff goes in +% Section 4. +% - State the falsifiability criterion that motivates Section 5: if a +% strategy is profitable on data with zero true signal (pure GBM noise), +% the profit must be an artifact of the backtest mechanics, not of the +% strategy logic. diff --git a/paper/sections/04_the_lookahead_bug.tex b/paper/sections/04_the_lookahead_bug.tex new file mode 100644 index 0000000..a28d1c4 --- /dev/null +++ b/paper/sections/04_the_lookahead_bug.tex @@ -0,0 +1,17 @@ +\section{The K12 Lookahead Bug} +\label{sec:the-bug} + +% TODO content notes (fill in once audit/input/ is populated and audited): +% - Identify the exact line(s) in backtester.py responsible for the leak. +% - Show a minimal before/after diff. +% - Explain mechanically why the genetic algorithm search (Iter12, 15 +% hyperparameters) was able to find and exploit this leak: GA optimizes +% whatever signal is available, including backtest-mechanics artifacts: if +% the fitness function rewards future-peeking, the search converges on +% parameters that maximize the exploit, not real predictive skill. 
+% - Quantify the leak's effect size on the original (real BTCUSDT) data: +% Sharpe under the bug vs Sharpe under time_machine.py (the honest engine), +% same hyperparameters, same data. +% - This section depends on the forensic files under audit/input/code/ — +% do not fill in specifics until that audit is complete (see MANIFEST.md +% for the exact commit hash and dataset hash being audited). diff --git a/paper/sections/05_noise_harness_methodology.tex b/paper/sections/05_noise_harness_methodology.tex new file mode 100644 index 0000000..f18d385 --- /dev/null +++ b/paper/sections/05_noise_harness_methodology.tex @@ -0,0 +1,20 @@ +\section{The Noise Harness Methodology} +\label{sec:noise-harness} + +% TODO content notes: +% - Describe the GBM null-data generator: dS = mu*S*dt + sigma*S*dW, fixed +% seed, parameters (mu, sigma, N steps, start price) documented in +% experiments/README.md and reproduced in 01_generate_gbm.py. +% - Key property to state explicitly: this series has zero exploitable +% structure by construction — no autocorrelation edge, no regime, nothing +% a real strategy could legitimately learn. +% - Define the test: run the same kernel/backtester pipeline against N +% independent GBM seeds. A backtester free of lookahead bias should +% produce a Sharpe distribution centered at ~0 across seeds. A backtester +% with a leak will produce a systematically positive Sharpe regardless of +% seed, because the "edge" comes from the mechanics, not the data. +% - State this as a pass/fail CI gate: mean Sharpe over >= 30 seeds must +% fall within a pre-registered null band (e.g. -0.3 to 0.3); anything +% outside that band fails the build. This is what +% .github/workflows/noise-harness.yml is wired to enforce once +% experiments/05_noise_harness.py exists. 
diff --git a/paper/sections/06_experimental_setup.tex b/paper/sections/06_experimental_setup.tex new file mode 100644 index 0000000..84f9b28 --- /dev/null +++ b/paper/sections/06_experimental_setup.tex @@ -0,0 +1,23 @@ +\section{Experimental Setup} +\label{sec:experimental-setup} + +% TODO content notes: +% - Enumerate the 5 experiments (paper-facing summary; full spec lives in +% experiments/README.md). NOTE: that README numbers GBM generation as 1 and +% has no sensitivity check — reconcile the two lists before writing: +% 1. Baseline replication: run K12 golden hyperparameters on real BTCUSDT +% 1m data, buggy backtester, reproduce Sharpe 14.49. +% 2. Honest replication: same hyperparameters, same data, time_machine.py +% (bar-by-bar, no lookahead), reproduce Sharpe -0.25. +% 3. Noise harness on buggy backtester: same hyperparameters, >=30 GBM +% seeds, buggy backtester. Expect systematically positive Sharpe. +% 4. Noise harness on honest backtester: same setup, time_machine.py. +% Expect Sharpe distribution centered at 0. +% 5. Sensitivity check: vary the lookahead window size synthetically (1-bar +% through N-bar leak) to show Sharpe scales with leak size, not coincidence. +% - State software/hardware environment: pinned in env/requirements.txt, +% env/python_version.txt, env/os_info.txt under audit/input/ (forensic) +% and experiments/requirements.txt (reproduction environment going +% forward, which may differ in version but not in semantics). +% - State exactly which files are authoritative for each experiment number +% once experiments/0N_*.py scripts exist. diff --git a/paper/sections/07_results.tex b/paper/sections/07_results.tex new file mode 100644 index 0000000..f51b3e3 --- /dev/null +++ b/paper/sections/07_results.tex @@ -0,0 +1,19 @@ +\section{Results} +\label{sec:results} + +% TODO content notes: +% - Table 1: side-by-side Sharpe (and other metrics: max drawdown, win rate, +% total return) for buggy vs honest backtester on real data. This is the +% "leak signature" headline result. 
+% - Figure 1 (leak_signature.png): equity curve comparison, buggy vs honest, +% same hyperparameters, real data. +% - Figure 2 (noise_control.png): histogram/distribution of Sharpe across +% >=30 GBM seeds, buggy backtester overlaid with honest backtester. The +% buggy distribution should be visibly shifted positive; the honest one +% centered near 0. +% - Figure 3 (fix_comparison.png): before/after of the actual code diff that +% fixed the leak, annotated with the Sharpe delta it caused. +% - Do not write actual numbers into this section until results/raw/*.json +% exist and have been validated against the audit/input/results/ originals +% (see MANIFEST.md hashes). Every number in this section must be traceable +% to a specific file + seed + commit. diff --git a/paper/sections/08_discussion_and_conclusion.tex b/paper/sections/08_discussion_and_conclusion.tex new file mode 100644 index 0000000..88831a3 --- /dev/null +++ b/paper/sections/08_discussion_and_conclusion.tex @@ -0,0 +1,22 @@ +\section{Discussion and Conclusion} +\label{sec:discussion} + +% TODO content notes: +% - Generalize beyond this one kernel: any vectorized backtester using +% pandas/numpy rolling/shift operations is at risk of this exact class of +% bug; it is not specific to genetic-algorithm-discovered strategies. +% - Practical recommendation: every backtesting pipeline should run the +% noise harness (Section 5) as a standing CI gate, the same way unit tests +% gate merges — not as a one-off audit. +% - Limitations: the noise harness detects backtest-mechanics leaks; it does +% NOT detect overfitting to real historical data (that is a distinct +% failure mode requiring out-of-sample / walk-forward validation, +% out of scope here). +% - Disclosure note: state plainly that this defect was found in an +% internal/proprietary research pipeline (Sentinel Research), and that +% this paper publishes the diagnostic methodology and a minimal +% reproduction, not the proprietary strategy code itself. 
+% - One-paragraph conclusion restating the core claim: Sharpe 14.49 on real +% BTCUSDT data was not skill but a lookahead-bug signature, and the fix +% collapsed it to Sharpe -0.25 — exactly the kind of result a noise harness +% exists to catch before capital is at risk. diff --git a/results/.gitkeep b/results/.gitkeep new file mode 100644 index 0000000..e69de29