{
  "version": "2.0.0",
  "generated": "2026-05-12",
  "count": 28,
  "disclaimer": "These benchmarks reference datasets that are NOT publicly accessible. Listed for industry-relevance reference only. Compiled from public statements, publications, and press releases. Access is gated by collaboration, data-sharing agreements, or remains closed entirely.",
  "private_benchmarks": [
    {
      "id": "pfizer-phase2-internal",
      "name": "Pfizer Phase II Trial Benchmark Dataset",
      "owner": "Pfizer",
      "owner_type": "pharma",
      "stage": "phase-ii",
      "modality": "cross-modality",
      "task": "Phase II efficacy and endpoint prediction from historical proprietary trial data",
      "public_proxy": "hint-trialbench",
      "access_mechanism": "closed",
      "evidence_source": "Pfizer AI/ML publications 2024-2025 reference internal trial benchmarks for endpoint modeling; data not released.",
      "estimated_size": "~1500 trials, 40,000 patients",
      "why_it_matters": "Real-world trial outcomes beat public synthetic sets; models trained here inform go/no-go on live programs.",
      "date_sourced": "2026-05-12"
    },
    {
      "id": "novartis-nibr-therapeutic-data",
      "name": "NIBR Therapeutics Data (NTD)",
      "owner": "Novartis NIBR",
      "owner_type": "pharma",
      "stage": "lead-id-admet",
      "modality": "small-molecule",
      "task": "Internal compound bioactivity + phenotypic + ADMET cross-referenced against clinical outcomes",
      "public_proxy": "chembl",
      "access_mechanism": "collaboration",
      "evidence_source": "Described partially in Nat Rev Drug Discov 2023 (Schneider et al.) and NIBR public talks; ~4M compound entries referenced.",
      "estimated_size": "~4M compounds, 20M assays",
      "why_it_matters": "Largest single-sponsor ADMET dataset in industry; several academic collaborations have gated access.",
      "date_sourced": "2026-05-12"
    },
    {
      "id": "astrazeneca-cas-dmpk",
      "name": "AstraZeneca CAS-backed DMPK Benchmarks",
      "owner": "AstraZeneca",
      "owner_type": "pharma",
      "stage": "lead-id-admet",
      "modality": "small-molecule",
      "task": "Compound DMPK prediction (clearance, PPB, Vd, half-life) on internal AZ library",
      "public_proxy": "tdc-admet",
      "access_mechanism": "closed",
      "evidence_source": "AZ MolecularAI papers (2021-2024) cite private DMPK evaluation sets; AZ-Karolinska joint releases expose subsets.",
      "estimated_size": "~200k measured DMPK endpoints",
      "why_it_matters": "AZ uses these benchmarks internally to gate compound progression.",
      "date_sourced": "2026-05-12"
    },
    {
      "id": "merck-admet-demystify",
      "name": "Merck Internal ADMET Benchmark (Demystifying ADMET)",
      "owner": "Merck & Co.",
      "owner_type": "pharma",
      "stage": "lead-id-admet",
      "modality": "small-molecule",
      "task": "ADMET property prediction across proprietary MSD compound library",
      "public_proxy": "polaris-admet",
      "access_mechanism": "closed",
      "evidence_source": "Demystifying ADMET-AI publications (J Chem Inf Model, 2023; 2025) reference internal benchmark held out from model training.",
      "estimated_size": "~150k compounds across 17 endpoints",
      "why_it_matters": "Drives compound-prioritization decisions; model performance numbers published but data not.",
      "date_sourced": "2026-05-12"
    },
    {
      "id": "genentech-gred-sar",
      "name": "Genentech gRED Structure-Activity Dataset",
      "owner": "Genentech gRED",
      "owner_type": "pharma",
      "stage": "hit-id",
      "modality": "small-molecule",
      "task": "Structure-activity modeling on active research programs",
      "public_proxy": "chembl",
      "access_mechanism": "closed",
      "evidence_source": "Genentech ML team presentations (NeurIPS AI4Science, ML4MolEng 2024).",
      "estimated_size": "undisclosed; referenced as 'millions of assay points'",
      "why_it_matters": "Used to benchmark LLMs and foundation models before internal deployment.",
      "date_sourced": "2026-05-12"
    },
    {
      "id": "recursion-full-phenomics",
      "name": "Recursion Full Phenomics Dataset",
      "owner": "Recursion Pharmaceuticals",
      "owner_type": "pharma",
      "stage": "hit-id",
      "modality": "small-molecule",
      "task": "Morphological phenotypic profiling at petabyte scale",
      "public_proxy": "rxrx3-phenomics-bench",
      "access_mechanism": "conditional-access",
      "evidence_source": "Recursion SEC filings, Phenom-1 paper; public RxRx1/3 subsets are a fraction of the full 50+ PB dataset.",
      "estimated_size": "~50 PB images, 18M compound wells",
      "why_it_matters": "The actual signal powering Phenom-1, Phenom-2 foundation models.",
      "date_sourced": "2026-05-12"
    },
    {
      "id": "insilico-longevity-full",
      "name": "Insilico Longevity Benchmark (Full Dataset)",
      "owner": "Insilico Medicine",
      "owner_type": "pharma",
      "stage": "virtual-cell",
      "modality": "cross-modality",
      "task": "Aging prediction, lifespan extension, geroprotector scoring on proprietary longevity cohorts",
      "public_proxy": "longevity-bench-insilico",
      "access_mechanism": "conditional-access",
      "evidence_source": "Insilico Longevity Benchmark portal exposes public subsets; full dataset references internal clinical collaborations.",
      "estimated_size": "~1M individuals across NHANES + internal cohorts, 500k methylation samples",
      "why_it_matters": "World's largest curated longevity-relevant dataset; powers Insilico's aging drug programs.",
      "date_sourced": "2026-05-12"
    },
    {
      "id": "moderna-mrna-internal",
      "name": "Moderna mRNA Design Internal Benchmark",
      "owner": "Moderna",
      "owner_type": "pharma",
      "stage": "hit-id",
      "modality": "rna-therapeutic",
      "task": "Codon optimization, UTR design, translation efficiency prediction",
      "public_proxy": "mrna-design-bench",
      "access_mechanism": "closed",
      "evidence_source": "Moderna publications reference proprietary expression datasets for mRNA design validation.",
      "estimated_size": "~100k constructs with HEK293 and primary-cell expression readout",
      "why_it_matters": "Critical for mRNA therapeutic design; informs every Moderna pipeline program.",
      "date_sourced": "2026-05-12"
    },
    {
      "id": "pfizer-mrna-internal",
      "name": "Pfizer mRNA/LNP Internal Benchmark",
      "owner": "Pfizer",
      "owner_type": "pharma",
      "stage": "hit-id",
      "modality": "rna-therapeutic",
      "task": "LNP formulation + mRNA design optimization",
      "public_proxy": "mrna-design-bench",
      "access_mechanism": "closed",
      "evidence_source": "Pfizer-BioNTech publications cite proprietary LNP-mRNA benchmark sets.",
      "estimated_size": "undisclosed",
      "why_it_matters": "Underpins Comirnaty and next-gen mRNA vaccines.",
      "date_sourced": "2026-05-12"
    },
    {
      "id": "deep-genomics-rna",
      "name": "Deep Genomics RNA Therapeutics Benchmark",
      "owner": "Deep Genomics",
      "owner_type": "biotech",
      "stage": "hit-id",
      "modality": "rna-therapeutic",
      "task": "Splicing prediction, antisense oligonucleotide design",
      "public_proxy": "mrna-design-bench",
      "access_mechanism": "collaboration",
      "evidence_source": "Deep Genomics AI platform papers and press releases; BigRNA model training set is proprietary.",
      "estimated_size": "undisclosed",
      "why_it_matters": "Powers BigRNA foundation model.",
      "date_sourced": "2026-05-12"
    },
    {
      "id": "isomorphic-internal",
      "name": "Isomorphic Labs Internal Structure/Docking Benchmark",
      "owner": "Isomorphic Labs (Alphabet)",
      "owner_type": "biotech",
      "stage": "hit-id",
      "modality": "cross-modality",
      "task": "Structure-based drug design evaluation",
      "public_proxy": "plinder-v2",
      "access_mechanism": "closed",
      "evidence_source": "Isomorphic Labs public statements and AlphaFold 3 paper supplementary reference proprietary evaluation held out from training.",
      "estimated_size": "undisclosed",
      "why_it_matters": "Ground truth for AlphaFold 3+ and future Isomorphic models.",
      "date_sourced": "2026-05-12"
    },
    {
      "id": "meta-fair-protein-internal",
      "name": "Meta FAIR Protein Design Internal Eval",
      "owner": "Meta FAIR / EvolutionaryScale",
      "owner_type": "biotech",
      "stage": "hit-id",
      "modality": "biologic",
      "task": "ESM / ESM3 evaluation on internal proprietary fitness experiments",
      "public_proxy": "proteingym",
      "access_mechanism": "closed",
      "evidence_source": "ESM3 paper supplementary mentions internal held-out wet-lab set.",
      "estimated_size": "undisclosed",
      "why_it_matters": "Validates claimed generalization for Meta/EvolutionaryScale's protein foundation models.",
      "date_sourced": "2026-05-12"
    },
    {
      "id": "ibm-rxn-retrosyn-internal",
      "name": "IBM RXN Internal Retrosynthesis Benchmark",
      "owner": "IBM Research",
      "owner_type": "tech",
      "stage": "hit-id",
      "modality": "small-molecule",
      "task": "Retrosynthesis route planning on proprietary rule sets",
      "public_proxy": "uspto-retrosyn",
      "access_mechanism": "collaboration",
      "evidence_source": "IBM RXN papers reference internal datasets beyond USPTO public set.",
      "estimated_size": "~5-10M proprietary reactions beyond USPTO",
      "why_it_matters": "Deeper retrosynthesis benchmark for industrial routes.",
      "date_sourced": "2026-05-12"
    },
    {
      "id": "fda-cdrh-ai-validation",
      "name": "FDA CDRH Internal AI Validation Sets",
      "owner": "FDA Center for Devices and Radiological Health",
      "owner_type": "regulatory",
      "stage": "post-market-rwe",
      "modality": "cross-modality",
      "task": "Regulatory validation of medical AI/ML devices",
      "public_proxy": "faers-bench",
      "access_mechanism": "closed",
      "evidence_source": "FDA CDRH AI/ML Action Plan (2021-2025) mentions internal validation datasets for 510(k) and De Novo pathway.",
      "estimated_size": "undisclosed",
      "why_it_matters": "Regulatory gold standard for AI-device approval.",
      "date_sourced": "2026-05-12"
    },
    {
      "id": "open-targets-pharma-partners",
      "name": "Open Targets Pharma Partner Extensions",
      "owner": "Open Targets consortium",
      "owner_type": "consortium",
      "stage": "target-id",
      "modality": "cross-modality",
      "task": "Target-disease association with proprietary pharma datasets layered on",
      "public_proxy": "open-targets",
      "access_mechanism": "data-sharing-agreement",
      "evidence_source": "Open Targets consortium papers (GSK, Sanofi, BMS, Pfizer) reference member-only data extensions.",
      "estimated_size": "undisclosed",
      "why_it_matters": "Adds internal essentiality/chemogenomics signal to the public target-disease graph.",
      "date_sourced": "2026-05-12"
    },
    {
      "id": "bms-internal-sar",
      "name": "Bristol-Myers Squibb Internal SAR Benchmark",
      "owner": "Bristol-Myers Squibb",
      "owner_type": "pharma",
      "stage": "hit-id",
      "modality": "small-molecule",
      "task": "Structure-activity modeling on internal BMS research programs",
      "public_proxy": "chembl",
      "access_mechanism": "closed",
      "evidence_source": "BMS AI papers (Schuffenhauer et al.) cite proprietary SAR sets.",
      "estimated_size": "~1M assay points",
      "why_it_matters": "Industry benchmark for generative chemistry deployment.",
      "date_sourced": "2026-05-12"
    },
    {
      "id": "valence-internal-expand",
      "name": "Valence Labs Internal ADMET Extensions",
      "owner": "Valence Labs (Recursion)",
      "owner_type": "biotech",
      "stage": "lead-id-admet",
      "modality": "small-molecule",
      "task": "ADMET prediction on Recursion-generated compounds beyond Polaris-public releases",
      "public_proxy": "polaris-admet",
      "access_mechanism": "conditional-access",
      "evidence_source": "Valence/Recursion publications + Polaris Hub description of partial data release.",
      "estimated_size": "~150k compounds",
      "why_it_matters": "Full set trains Recursion's ADMET foundation models.",
      "date_sourced": "2026-05-12"
    },
    {
      "id": "sanofi-internal-pk",
      "name": "Sanofi Internal PK/PD Benchmark",
      "owner": "Sanofi",
      "owner_type": "pharma",
      "stage": "ind-enabling",
      "modality": "small-molecule",
      "task": "PK projection, interspecies scaling",
      "public_proxy": "pkpd-obach",
      "access_mechanism": "closed",
      "evidence_source": "Sanofi AI collaborations with Exscientia reference internal PK benchmarks.",
      "estimated_size": "undisclosed",
      "why_it_matters": "Drives AI-enabled PK modeling decisions.",
      "date_sourced": "2026-05-12"
    },
    {
      "id": "gilead-hcv-antiviral",
      "name": "Gilead Internal Antiviral Benchmark",
      "owner": "Gilead Sciences",
      "owner_type": "pharma",
      "stage": "hit-id",
      "modality": "small-molecule",
      "task": "Antiviral potency modeling (HCV, HIV, SARS-CoV-2) on proprietary compound archive",
      "public_proxy": "asap-antiviral-2025",
      "access_mechanism": "closed",
      "evidence_source": "Gilead research publications reference internal antiviral screening collections.",
      "estimated_size": "~500k compounds antiviral screened",
      "why_it_matters": "Real-world antiviral MoA coverage not seen in public sets.",
      "date_sourced": "2026-05-12"
    },
    {
      "id": "genomic-health-rwe",
      "name": "Flatiron Health Real-World Oncology Benchmark",
      "owner": "Flatiron Health (Roche)",
      "owner_type": "pharma",
      "stage": "post-market-rwe",
      "modality": "cross-modality",
      "task": "Real-world outcomes and patient journey modeling",
      "public_proxy": "mimic-benchmark",
      "access_mechanism": "data-sharing-agreement",
      "evidence_source": "Flatiron publications describe de-identified oncology EHR + outcomes as a proprietary benchmark for AI models.",
      "estimated_size": "~4M oncology patients",
      "why_it_matters": "Gold standard for RWE AI; widely licensed to pharma.",
      "date_sourced": "2026-05-12"
    },
    {
      "id": "tempus-onco-ai",
      "name": "Tempus Oncology AI Benchmark",
      "owner": "Tempus Labs",
      "owner_type": "biotech",
      "stage": "target-id",
      "modality": "cross-modality",
      "task": "Multi-omic oncology prediction on Tempus's clinical-molecular database",
      "public_proxy": "cptac-proteogenomic",
      "access_mechanism": "data-sharing-agreement",
      "evidence_source": "Tempus publications (Nat Med 2023-2025) reference internal benchmarks.",
      "estimated_size": "~200k sequenced patients + outcomes",
      "why_it_matters": "Feeds Tempus AI oncology product.",
      "date_sourced": "2026-05-12"
    },
    {
      "id": "ginkgo-biologics",
      "name": "Ginkgo Bioworks Biologics Design Benchmark",
      "owner": "Ginkgo Bioworks",
      "owner_type": "biotech",
      "stage": "hit-id",
      "modality": "biologic",
      "task": "Protein engineering and enzyme design across Ginkgo's foundry",
      "public_proxy": "protein-design-bench-2026",
      "access_mechanism": "collaboration",
      "evidence_source": "Ginkgo AI publications and press releases reference foundry-scale internal evaluation.",
      "estimated_size": "~millions of enzyme variants with activity measurements",
      "why_it_matters": "Ground truth for industrial enzyme design.",
      "date_sourced": "2026-05-12"
    },
    {
      "id": "roche-pred-admet",
      "name": "Roche pRED ADMET Benchmark",
      "owner": "Roche pRED",
      "owner_type": "pharma",
      "stage": "lead-id-admet",
      "modality": "small-molecule",
      "task": "ADMET modeling on Roche's internal library",
      "public_proxy": "tdc-admet",
      "access_mechanism": "closed",
      "evidence_source": "Roche pRED publications on ML-ADMET reference proprietary datasets.",
      "estimated_size": "~300k compound-endpoint pairs",
      "why_it_matters": "Underpins Roche's internal ML-ADMET deployment.",
      "date_sourced": "2026-05-12"
    },
    {
      "id": "chugai-ab",
      "name": "Chugai Antibody Engineering Benchmark",
      "owner": "Chugai Pharmaceutical (Roche)",
      "owner_type": "pharma",
      "stage": "hit-id",
      "modality": "biologic",
      "task": "Bispecific and switch antibody optimization",
      "public_proxy": "therapeutic-antibody-bench-2026",
      "access_mechanism": "collaboration",
      "evidence_source": "Chugai publications reference internal switch-antibody dataset.",
      "estimated_size": "undisclosed",
      "why_it_matters": "Unique bispecific / switch format benchmark.",
      "date_sourced": "2026-05-12"
    },
    {
      "id": "exscientia-design",
      "name": "Exscientia Precision Medicine Benchmark",
      "owner": "Exscientia",
      "owner_type": "biotech",
      "stage": "hit-id",
      "modality": "small-molecule",
      "task": "Generative design + phenotypic outcome modeling on proprietary drug programs",
      "public_proxy": "chembl",
      "access_mechanism": "closed",
      "evidence_source": "Exscientia publications and 10-K references for Centaur AI outputs.",
      "estimated_size": "undisclosed",
      "why_it_matters": "Precision medicine + generative chem coupled benchmark.",
      "date_sourced": "2026-05-12"
    },
    {
      "id": "xaira-foundation-internal",
      "name": "Xaira Therapeutics Foundation Benchmark",
      "owner": "Xaira Therapeutics",
      "owner_type": "biotech",
      "stage": "virtual-cell",
      "modality": "cross-modality",
      "task": "Virtual-cell + target-based design on proprietary multi-omic panels",
      "public_proxy": "virtualcellbench-2026",
      "access_mechanism": "closed",
      "evidence_source": "Xaira launch materials (Apr 2024) and subsequent publications mention proprietary benchmarks for foundation models.",
      "estimated_size": "undisclosed; built on Illumina-scale partnerships",
      "why_it_matters": "New well-funded entrant; benchmarks shape entire platform.",
      "date_sourced": "2026-05-12"
    },
    {
      "id": "open-problems-sponsor-private",
      "name": "Open Problems Sponsor-Private Challenge Data",
      "owner": "Open Problems consortium (sponsors: 10x Genomics, Chan Zuckerberg Biohub)",
      "owner_type": "consortium",
      "stage": "virtual-cell",
      "modality": "cross-modality",
      "task": "Sponsor-provided test sets for Kaggle-hosted competitions",
      "public_proxy": "openproblems-perturbation",
      "access_mechanism": "conditional-access",
      "evidence_source": "Kaggle Open Problems competition rules; test labels held out until competition close.",
      "estimated_size": "varies per competition (~100k cells each)",
      "why_it_matters": "Partially-private model for prospective benchmark evaluation.",
      "date_sourced": "2026-05-12"
    },
    {
      "id": "takeda-pipeline-ai",
      "name": "Takeda Internal AI Pipeline Benchmark",
      "owner": "Takeda",
      "owner_type": "pharma",
      "stage": "lead-id-admet",
      "modality": "small-molecule",
      "task": "ADMET + PK on Takeda internal compound archive",
      "public_proxy": "tdc-admet",
      "access_mechanism": "collaboration",
      "evidence_source": "Takeda AI platform publications reference internal evaluation set.",
      "estimated_size": "undisclosed",
      "why_it_matters": "Japan-based pharma with distinct chemistry class distribution.",
      "date_sourced": "2026-05-12"
    }
  ]
}