array(41) {
["request_overridden_res"]=>
string(1) "3"
["project_status"]=>
string(30) "approved_pending_dua_signature"
["project_assoc_trials"]=>
array(3) {
[0]=>
object(WP_Post)#5038 (24) {
["ID"]=>
int(1920)
["post_author"]=>
string(4) "1363"
["post_date"]=>
string(19) "2021-11-23 15:30:00"
["post_date_gmt"]=>
string(19) "2021-11-23 15:30:00"
["post_content"]=>
string(0) ""
["post_title"]=>
string(278) "NCT02257736 - A Phase 3 Randomized, Placebo-controlled Double-blind Study of JNJ-56021927 in Combination With Abiraterone Acetate and Prednisone Versus Abiraterone Acetate and Prednisone in Subjects With Chemotherapy-naive Metastatic Castration-resistant Prostate Cancer (mCRPC)"
["post_excerpt"]=>
string(0) ""
["post_status"]=>
string(7) "publish"
["comment_status"]=>
string(6) "closed"
["ping_status"]=>
string(6) "closed"
["post_password"]=>
string(0) ""
["post_name"]=>
string(197) "nct02257736-a-phase-3-randomized-placebo-controlled-double-blind-study-of-jnj-56021927-in-combination-with-abiraterone-acetate-and-prednisone-versus-abiraterone-acetate-and-prednisone-in-subjects-w"
["to_ping"]=>
string(0) ""
["pinged"]=>
string(0) ""
["post_modified"]=>
string(19) "2025-04-30 16:19:18"
["post_modified_gmt"]=>
string(19) "2025-04-30 20:19:18"
["post_content_filtered"]=>
string(0) ""
["post_parent"]=>
int(0)
["guid"]=>
string(246) "https://dev-yoda.pantheonsite.io/clinical-trial/nct02257736-a-phase-3-randomized-placebo-controlled-double-blind-study-of-jnj-56021927-in-combination-with-abiraterone-acetate-and-prednisone-versus-abiraterone-acetate-and-prednisone-in-subjects-w/"
["menu_order"]=>
int(0)
["post_type"]=>
string(14) "clinical_trial"
["post_mime_type"]=>
string(0) ""
["comment_count"]=>
string(1) "0"
["filter"]=>
string(3) "raw"
}
[1]=>
object(WP_Post)#5040 (24) {
["ID"]=>
int(1568)
["post_author"]=>
string(4) "1363"
["post_date"]=>
string(19) "2016-10-31 14:30:00"
["post_date_gmt"]=>
string(19) "2016-10-31 14:30:00"
["post_content"]=>
string(0) ""
["post_title"]=>
string(223) "NCT00887198 - A Phase 3, Randomized, Double-blind, Placebo-Controlled Study of Abiraterone Acetate (CB7630) Plus Prednisone in Asymptomatic or Mildly Symptomatic Patients With Metastatic Castration-Resistant Prostate Cancer"
["post_excerpt"]=>
string(0) ""
["post_status"]=>
string(7) "publish"
["comment_status"]=>
string(6) "closed"
["ping_status"]=>
string(6) "closed"
["post_password"]=>
string(0) ""
["post_name"]=>
string(193) "nct00887198-a-phase-3-randomized-double-blind-placebo-controlled-study-of-abiraterone-acetate-cb7630-plus-prednisone-in-asymptomatic-or-mildly-symptomatic-patients-with-metastatic-castration-re"
["to_ping"]=>
string(0) ""
["pinged"]=>
string(0) ""
["post_modified"]=>
string(19) "2025-04-30 15:32:35"
["post_modified_gmt"]=>
string(19) "2025-04-30 19:32:35"
["post_content_filtered"]=>
string(0) ""
["post_parent"]=>
int(0)
["guid"]=>
string(242) "https://dev-yoda.pantheonsite.io/clinical-trial/nct00887198-a-phase-3-randomized-double-blind-placebo-controlled-study-of-abiraterone-acetate-cb7630-plus-prednisone-in-asymptomatic-or-mildly-symptomatic-patients-with-metastatic-castration-re/"
["menu_order"]=>
int(0)
["post_type"]=>
string(14) "clinical_trial"
["post_mime_type"]=>
string(0) ""
["comment_count"]=>
string(1) "0"
["filter"]=>
string(3) "raw"
}
[2]=>
object(WP_Post)#5039 (24) {
["ID"]=>
int(1845)
["post_author"]=>
string(4) "1363"
["post_date"]=>
string(19) "2019-12-12 12:23:00"
["post_date_gmt"]=>
string(19) "2019-12-12 12:23:00"
["post_content"]=>
string(0) ""
["post_title"]=>
string(257) "NCT01715285 - A Randomized, Double-blind, Comparative Study of Abiraterone Acetate Plus Low-Dose Prednisone Plus Androgen Deprivation Therapy (ADT) Versus ADT Alone in Newly Diagnosed Subjects With High-Risk, Metastatic Hormone-naive Prostate Cancer (mHNPC)"
["post_excerpt"]=>
string(0) ""
["post_status"]=>
string(7) "publish"
["comment_status"]=>
string(6) "closed"
["ping_status"]=>
string(6) "closed"
["post_password"]=>
string(0) ""
["post_name"]=>
string(193) "nct01715285-a-randomized-double-blind-comparative-study-of-abiraterone-acetate-plus-low-dose-prednisone-plus-androgen-deprivation-therapy-adt-versus-adt-alone-in-newly-diagnosed-subjects-with-h"
["to_ping"]=>
string(0) ""
["pinged"]=>
string(0) ""
["post_modified"]=>
string(19) "2025-04-30 16:02:44"
["post_modified_gmt"]=>
string(19) "2025-04-30 20:02:44"
["post_content_filtered"]=>
string(0) ""
["post_parent"]=>
int(0)
["guid"]=>
string(242) "https://dev-yoda.pantheonsite.io/clinical-trial/nct01715285-a-randomized-double-blind-comparative-study-of-abiraterone-acetate-plus-low-dose-prednisone-plus-androgen-deprivation-therapy-adt-versus-adt-alone-in-newly-diagnosed-subjects-with-h/"
["menu_order"]=>
int(0)
["post_type"]=>
string(14) "clinical_trial"
["post_mime_type"]=>
string(0) ""
["comment_count"]=>
string(1) "0"
["filter"]=>
string(3) "raw"
}
}
["project_title"]=>
string(102) "Estimating Conditional Average Treatment Effects from Cancer Trial Data Using Causal Survival Analysis"
["project_narrative_summary"]=>
string(810) "Patients with cancer often respond differently to the same treatment, thus pointing to the potential benefit of personalized care. Clinical trial data can help identify patient groups likely to benefit, but this data is often incomplete due to early treatment stops or missing follow-up. We aim to predict the individual-level effectiveness using state-of-the-art methods that can handle such incomplete data while uncovering meaningful treatment response patterns. By showing which patients benefit more or less, we aim to support evidence generation for personalizing treatment strategies and identifying predictive biomarkers (e.g., tumor characteristics) that predict treatment response. Altogether, our work will contribute to guiding more effective, personalized cancer treatments in real-world settings."
["project_learn_source"]=>
string(12) "scien_public"
["principal_investigator"]=>
array(7) {
["first_name"]=>
string(6) "Stefan"
["last_name"]=>
string(11) "Feuerriegel"
["degree"]=>
string(5) "Ph.D."
["primary_affiliation"]=>
string(10) "LMU Munich"
["email"]=>
string(18) "feuerriegel@lmu.de"
["state_or_province"]=>
string(7) "Bavaria"
["country"]=>
string(7) "Germany"
}
["project_key_personnel"]=>
array(1) {
[0]=>
array(6) {
["p_pers_f_name"]=>
string(5) "Yuxin"
["p_pers_l_name"]=>
string(4) "Wang"
["p_pers_degree"]=>
string(5) "M.Sc."
["p_pers_pr_affil"]=>
string(10) "LMU Munich"
["p_pers_scop_id"]=>
string(0) ""
["requires_data_access"]=>
string(3) "yes"
}
}
["project_ext_grants"]=>
array(2) {
["value"]=>
string(3) "yes"
["label"]=>
string(65) "External grants or funds are being used to support this research."
}
["project_funding_source"]=>
string(231) "The research is supported by Munich Center Machine Learning (MCML) and the DAAD programme Konrad Zuse Schools of Excellence in Artificial Intelligence, sponsored by the Federal Ministry of Research, Technology and Space in Germany."
["project_date_type"]=>
string(18) "full_crs_supp_docs"
["property_scientific_abstract"]=>
string(1374) "Background:
Estimating conditional average treatment effects (CATEs) from censored data could help in personalized medicine, especially by identifying patient subgroups that benefit from treatment and by locating predictive biomarkers (see our paper in Nature Medicine [Feuerriegel et al. 2024] that discusses the importance of the CATE for personalized medicine). The key empirical challenge is censoring, which can bias the estimation of CATEs.
Objective:
To estimate the CATE using an assumption-lean method that provides partial identification of CATE in censored datasets.
Study Design:
This is an empirical study using clinical trial data. We construct informative bounds for CATEs in the presence of censoring to generate robust clinical evidence.
Participants:
All patients enrolled in the trial provided via the YODA platform met the inclusion criteria (complete treatment assignment data).
Primary and Secondary Outcome Measure(s):
The primary outcomes are (i) overall survival (OS) and (ii) progression-free survival (PFS), each with associated event indicators. No secondary outcomes are planned.
Statistical Analysis:
We will compute descriptive statistics, estimate the CATE (with bounds), analyze subgroups, and perform sensitivity analyses. "
["project_brief_bg"]=>
string(2839) "In cancer care, patients often respond differently to the same treatment [Feurriegel et al. 2024]. Some treatments lead to significant benefits, such as delayed disease progression, while others may see little to no effect or even reduced survival due to side effects. Understanding this variation, often shaped by demographic characteristics, tumor type, and genetic modifications, among others, is a central challenge to promoting personalized decision-making in oncology. However, clinical trial data are frequently incomplete due to issues like dropout or loss of follow-up. This makes it difficult to determine which subgroups of patients truly benefit from a given therapy [Klein et al. 2006, Zhang et al. 2017, Wiegrebe et al. 2023, Gupta et al. 2025].
Our work aims to address this challenge. Rather than relying only on average treatment effects across the whole population, we aim to identify subgroups of patients who are more likely to benefit, based on clinical or molecular characteristics. For this, we use a state-of-the-art causal inference method (e.g., partial identification via meta-learner) for clinical trial data to better interpret outcomes that are only partially observed, such as progression-free survival. Unlike existing methods for causal inference that often assume perfect patient follow-up [Tan et al. 2006, Curth et al. 2021, Gao et al. 2021, Xu et al. 2022, Xu et al. 2024], our approach remains reliable when those conditions do not hold. As such, our aim is to generate clinical evidence that can support personalized decision-making in oncology by helping clinicians match treatments to the patients who are most likely to benefit.
Therefore, to support better treatment decisions in real-world cancer care, we aim to develop and evaluate a flexible approach that remains reliable even when patient outcomes are incomplete, such as when patients drop out of a trial or are lost to follow-up. Rather than relying on strict assumptions or complex modeling, our method accesses clinical trial data to offer a practical way to understand how different groups of patients are likely to respond to treatment. At the technical level, partial identification in our setting provides a range of likely outcomes for each group (that accounts for the partially observed data).
Our approach could provide a blueprint for evidence generation from time-to-event data in clinical trials because it enables individualized treatment decisions, even when some data are missing (e.g., due to dropout). By identifying which patients are more or less likely to benefit from a specific therapy, our work supports a more personalized and effective way of delivering cancer care. In doing so, it helps bridge the gap between data limitations and real-world clinical needs.
"
["project_specific_aims"]=>
string(941) "This project aims to identify which cancer patients are most likely to benefit from a treatment, even when patient outcomes are only partially observed. Instead of relying on strict or unrealistic assumptions as in existing methods, our method is designed to work under real-world conditions by providing a range of likely treatment outcomes for different patient subgroups (e.g., bounds on the CATE via partial identification).
Specifically, we propose and evaluate a flexible learning framework that adjusts for incomplete follow-up using minimal assumptions and can be applied with arbitrary machine learning methods and/or general clinical trial data. By accessing cancer trial data, we are aiming to demonstrate the full potential of partial identification of the CATE in uncovering patient subgroups that are most likely to benefit from treatment, thereby informing more effective and personalized therapeutic decisions."
["project_study_design"]=>
array(2) {
["value"]=>
string(8) "meth_res"
["label"]=>
string(23) "Methodological research"
}
["project_purposes"]=>
array(4) {
[0]=>
array(2) {
["value"]=>
string(56) "new_research_question_to_examine_treatment_effectiveness"
["label"]=>
string(114) "New research question to examine treatment effectiveness on secondary endpoints and/or within subgroup populations"
}
[1]=>
array(2) {
["value"]=>
string(76) "confirm_or_validate previously_conducted_research_on_treatment_effectiveness"
["label"]=>
string(76) "Confirm or validate previously conducted research on treatment effectiveness"
}
[2]=>
array(2) {
["value"]=>
string(37) "develop_or_refine_statistical_methods"
["label"]=>
string(37) "Develop or refine statistical methods"
}
[3]=>
array(2) {
["value"]=>
string(34) "research_on_clinical_trial_methods"
["label"]=>
string(34) "Research on clinical trial methods"
}
}
["project_research_methods"]=>
string(382) "Data source: We will use participant-level data from the trials provided through the YODA Project.
Inclusion criteria: All participants enrolled in the selected clinical trial(s) will be included in the analysis.
Exclusion criteria: Patients with incomplete treatment after assignment will be excluded.
No other datasets will be pooled."
["project_main_outcome_measure"]=>
string(245) "The primary outcome: (i) overall survival (OS) time and (ii) progression-free survival (PFS) time, along with an event indicator denoting whether progression / death occurred or the data were censored.
The secondary outcome: None."
["project_main_predictor_indep"]=>
string(578) "The main independent variable is the treatment group (e.g., Trial NCT02257736: “Placebo + Abiraterone Acetate + Prednisolone” vs. “Apalutamide + Abiraterone Acetate + Prednisolone”). It will be defined as the treatment indicator in the causal inference setting. This is a binary variable indicating whether a patient received the experimental or control regimen.
In addition, information about demographics such as age, sex/gender, race, ethnicity, but also tumor characteristics, will be considered as covariates and used in the CATE estimation.
"
["project_other_variables_interest"]=>
string(823) "We will consider a range of baseline covariates as additional variables for descriptive and effect modifier analysis.
These e.g. include:
- Demographics: Age (continuous), sex (male/female), and race (categorized as White, Black or African American, Asian, and Other).
- Disease characteristics: ECOG performance status (0–1 vs ≥2), number of prior therapies (categorical), cytogenetic risk profile (standard vs high risk), and ISS stage (I, II, III).
- Interventions: “Placebo + Abiraterone Acetate + Prednisolone” vs. “Apalutamide + Abiraterone Acetate + Prednisolone” (categorized as Control / Treated).
The demographics and disease characteristics will be used for confounder adjustment and to characterize subgroups in which treatment effects may differ."
["project_stat_analysis_plan"]=>
string(1301) "Descriptive analysis:
We begin by summarizing key features of the patient population. This includes computing median, mean, minimum, and maximum survival times across clinically relevant subgroups (e.g., age groups, biomarker levels, tumor stages). These statistics help characterize outcome variability and identify broad patterns.
CATE estimation with bounds:
To handle incomplete survival data, we estimate the CATE in addition to the average treatment effect (ATE) to understand treatment effect heterogeneity in the population. We further use partial identification to obtain bounds on the CATE in the presence of censoring. Specifically, our method uses a doubly robust approach and efficient influence function theory to estimate bounds and ensure valid results even when data are partially missing or models are misspecified.
Subgroup evaluation:
We evaluate treatment effects across subgroups defined by clinical characteristics. For each subgroup, we calculate the CATE bounds and identify those with consistently positive lower bounds, indicating benefit from treatment. This helps prioritize subpopulations most likely to respond to an anti-cancer drug.
All analyses will be done in R / Python within a secure environment."
["project_software_used"]=>
array(4) {
[0]=>
array(2) {
["value"]=>
string(6) "python"
["label"]=>
string(6) "Python"
}
[1]=>
array(2) {
["value"]=>
string(1) "r"
["label"]=>
string(1) "R"
}
[2]=>
array(2) {
["value"]=>
string(7) "rstudio"
["label"]=>
string(7) "RStudio"
}
[3]=>
array(2) {
["value"]=>
string(5) "stata"
["label"]=>
string(5) "STATA"
}
}
["project_timeline"]=>
string(587) "Start of project: July 2025
Completion of data cleaning and descriptive analysis: July 2025
Implementation of main methods: Done (Codes are finished and tested reliably on synthetic data)
Evaluation and statistical interpretation: August 2025
Draft manuscript ready: August 2025
First submission for publication: August 2025
Report results back to YODA: September 2025
The entire project is expected to be completed within 3 months, but we ask for access for a 12-month period to accommodate potential updates during the revision process."
["project_dissemination_plan"]=>
string(324) "We plan to submit one or two manuscripts to high-impact journals such as Nature Biomedical Engineering, PNAS, or Nature Communications. Our target audience includes researchers in medicine, epidemiology, and oncology, particularly those interested in CATE estimation and cancer trial data applications in survival analysis. "
["project_bibliography"]=>
string(2759) "
- Feuerriegel, Stefan, Dennis Frauen, Valentyn Melnychuk, Jonas Schweisthal, Konstantin Hess, Alicia Curth, Stefan Bauer, Niki Kilbertus, Isaac S. Kohane, and Mihaela van der Schaar. Causal machine learning for predicting treatment outcomes. Nature Medicine 30, no. 4: 958-968, 2024.
- John P. Klein and Melvin L. Moeschberger. Survival Analysis: Techniques for censored and truncated data. Springer Science & Business Media, 2006.
- Weijia Zhang, Thuc Duy Le, Lin Liu, Zhi-Hua Zhou, and Jiuyong Li. Mining heterogeneous causal effects for personalized cancer treatment. Bioinformatics, 33(15):2372–2378, 2017.
- Simon Wiegrebe, Philipp Kopper, Raphael Sonabend, Bernd Bischl, and Andreas Bender. Deep learning for survival analysis: a review. Artificial Intelligence Review 57, no. 3: 65, 2024.
- Tulika Rudra Gupta, Daniel Schwartz, Riddhiman Saha, Riddhiman Saha, , and . Informative censoring in externally controlled clinical trials: A potential source of bias. ESMO Open, 10(1):104094, 2025.
- Zhiqiang Tan. A distributional approach for causal inference using propensity scores. Journal of the American Statistical Association, 101(476):1619–1637, 2006.
- Alicia Curth, Changhee Lee, and Mihaela van der Schaar. Survite: Learning heterogeneous treatment effects from time-to-event data. In NeurIPS, 2021.
- Zijun Gao and Trevor Hastie. Estimating heterogeneous treatment effects for general responses, arXiv preprint, arXiv:2103.04277, 2021.
- Yizhe Xu, Nikolaos Ignatiadis, Erik Sverdrup, Scott Fleming, Stefan Wager, and Nigam Shah. Treatment heterogeneity for survival outcomes, arXiv preprint, arXiv:2103.04277, 2022.
- Shenbo Xu, Raluca Cobzaru, Stan N. Finkelstein, Roy E. Welsch, Kenney Ng, and Zach Shahn. Estimating heterogeneous treatment effects on survival outcomes using counterfactual censoring unbiased transformations, arXiv preprint, arXiv:2401.11263, 2024.
"
["project_suppl_material"]=>
bool(false)
["project_coi"]=>
array(2) {
[0]=>
array(1) {
["file_coi"]=>
array(21) {
["ID"]=>
int(17570)
["id"]=>
int(17570)
["title"]=>
string(19) "coi_form_stefan.pdf"
["filename"]=>
string(19) "coi_form_stefan.pdf"
["filesize"]=>
int(20553)
["url"]=>
string(68) "https://yoda.yale.edu/wp-content/uploads/2025/07/coi_form_stefan.pdf"
["link"]=>
string(65) "https://yoda.yale.edu/data-request/2025-0488/coi_form_stefan-pdf/"
["alt"]=>
string(0) ""
["author"]=>
string(4) "2145"
["description"]=>
string(0) ""
["caption"]=>
string(0) ""
["name"]=>
string(19) "coi_form_stefan-pdf"
["status"]=>
string(7) "inherit"
["uploaded_to"]=>
int(17569)
["date"]=>
string(19) "2025-07-04 12:28:42"
["modified"]=>
string(19) "2025-07-04 12:28:44"
["menu_order"]=>
int(0)
["mime_type"]=>
string(15) "application/pdf"
["type"]=>
string(11) "application"
["subtype"]=>
string(3) "pdf"
["icon"]=>
string(62) "https://yoda.yale.edu/wp/wp-includes/images/media/document.png"
}
}
[1]=>
array(1) {
["file_coi"]=>
array(21) {
["ID"]=>
int(17571)
["id"]=>
int(17571)
["title"]=>
string(18) "coi_form_yuxin.pdf"
["filename"]=>
string(18) "coi_form_yuxin.pdf"
["filesize"]=>
int(20188)
["url"]=>
string(67) "https://yoda.yale.edu/wp-content/uploads/2025/07/coi_form_yuxin.pdf"
["link"]=>
string(64) "https://yoda.yale.edu/data-request/2025-0488/coi_form_yuxin-pdf/"
["alt"]=>
string(0) ""
["author"]=>
string(4) "2145"
["description"]=>
string(0) ""
["caption"]=>
string(0) ""
["name"]=>
string(18) "coi_form_yuxin-pdf"
["status"]=>
string(7) "inherit"
["uploaded_to"]=>
int(17569)
["date"]=>
string(19) "2025-07-04 12:28:43"
["modified"]=>
string(19) "2025-07-04 12:28:44"
["menu_order"]=>
int(0)
["mime_type"]=>
string(15) "application/pdf"
["type"]=>
string(11) "application"
["subtype"]=>
string(3) "pdf"
["icon"]=>
string(62) "https://yoda.yale.edu/wp/wp-includes/images/media/document.png"
}
}
}
["data_use_agreement_training"]=>
bool(true)
["human_research_protection_training"]=>
bool(true)
["certification"]=>
bool(true)
["search_order"]=>
string(1) "0"
["project_send_email_updates"]=>
bool(false)
["project_publ_available"]=>
bool(true)
["project_year_access"]=>
string(0) ""
["project_rep_publ"]=>
bool(false)
["project_assoc_data"]=>
array(0) {
}
["project_due_dil_assessment"]=>
bool(false)
["project_title_link"]=>
array(21) {
["ID"]=>
int(17635)
["id"]=>
int(17635)
["title"]=>
string(28) "Data Request Approved Notice"
["filename"]=>
string(32) "Data-Request-Approved-Notice.pdf"
["filesize"]=>
int(195663)
["url"]=>
string(81) "https://yoda.yale.edu/wp-content/uploads/2025/07/Data-Request-Approved-Notice.pdf"
["link"]=>
string(77) "https://yoda.yale.edu/data-request/2025-0488/data-request-approved-notice-55/"
["alt"]=>
string(0) ""
["author"]=>
string(4) "1885"
["description"]=>
string(0) ""
["caption"]=>
string(0) ""
["name"]=>
string(31) "data-request-approved-notice-55"
["status"]=>
string(7) "inherit"
["uploaded_to"]=>
int(17569)
["date"]=>
string(19) "2025-07-17 18:12:24"
["modified"]=>
string(19) "2025-07-17 18:12:24"
["menu_order"]=>
int(0)
["mime_type"]=>
string(15) "application/pdf"
["type"]=>
string(11) "application"
["subtype"]=>
string(3) "pdf"
["icon"]=>
string(62) "https://yoda.yale.edu/wp/wp-includes/images/media/document.png"
}
["project_review_link"]=>
bool(false)
["project_highlight_button"]=>
string(0) ""
["request_data_partner"]=>
string(0) ""
}
data partner
array(1) {
[0]=>
string(0) ""
}
pi country
array(0) {
}
pi affil
array(0) {
}
products
array(0) {
}
num of trials
array(1) {
[0]=>
string(1) "0"
}
res
array(1) {
[0]=>
string(1) "3"
}
Research Proposal
Project Title:
Estimating Conditional Average Treatment Effects from Cancer Trial Data Using Causal Survival Analysis
Scientific Abstract:
Background:
Estimating conditional average treatment effects (CATEs) from censored data could help in personalized medicine, especially by identifying patient subgroups that benefit from treatment and by locating predictive biomarkers (see our paper in Nature Medicine [Feuerriegel et al. 2024] that discusses the importance of the CATE for personalized medicine). The key empirical challenge is censoring, which can bias the estimation of CATEs.
Objective:
To estimate the CATE using an assumption-lean method that provides partial identification of CATE in censored datasets.
Study Design:
This is an empirical study using clinical trial data. We construct informative bounds for CATEs in the presence of censoring to generate robust clinical evidence.
Participants:
All patients enrolled in the trial provided via the YODA platform met the inclusion criteria (complete treatment assignment data).
Primary and Secondary Outcome Measure(s):
The primary outcomes are (i) overall survival (OS) and (ii) progression-free survival (PFS), each with associated event indicators. No secondary outcomes are planned.
Statistical Analysis:
We will compute descriptive statistics, estimate the CATE (with bounds), analyze subgroups, and perform sensitivity analyses.
Brief Project Background and Statement of Project Significance:
In cancer care, patients often respond differently to the same treatment [Feuerriegel et al. 2024]. Some patients experience significant benefits, such as delayed disease progression, while others may see little to no effect or even reduced survival due to side effects. Understanding this variation, often shaped by demographic characteristics, tumor type, and genetic modifications, among others, is a central challenge in promoting personalized decision-making in oncology. However, clinical trial data are frequently incomplete due to issues like dropout or loss to follow-up. This makes it difficult to determine which subgroups of patients truly benefit from a given therapy [Klein et al. 2006, Zhang et al. 2017, Wiegrebe et al. 2024, Gupta et al. 2025].
Our work aims to address this challenge. Rather than relying only on average treatment effects across the whole population, we aim to identify subgroups of patients who are more likely to benefit, based on clinical or molecular characteristics. For this, we use a state-of-the-art causal inference method (e.g., partial identification via meta-learner) for clinical trial data to better interpret outcomes that are only partially observed, such as progression-free survival. Unlike existing methods for causal inference that often assume perfect patient follow-up [Tan et al. 2006, Curth et al. 2021, Gao et al. 2021, Xu et al. 2022, Xu et al. 2024], our approach remains reliable when those conditions do not hold. As such, our aim is to generate clinical evidence that can support personalized decision-making in oncology by helping clinicians match treatments to the patients who are most likely to benefit.
Therefore, to support better treatment decisions in real-world cancer care, we aim to develop and evaluate a flexible approach that remains reliable even when patient outcomes are incomplete, such as when patients drop out of a trial or are lost to follow-up. Rather than relying on strict assumptions or complex modeling, our method uses clinical trial data to offer a practical way to understand how different groups of patients are likely to respond to treatment. At the technical level, partial identification in our setting provides a range of likely outcomes for each group (one that accounts for the partially observed data).
Our approach could provide a blueprint for evidence generation from time-to-event data in clinical trials because it enables individualized treatment decisions, even when some data are missing (e.g., due to dropout). By identifying which patients are more or less likely to benefit from a specific therapy, our work supports a more personalized and effective way of delivering cancer care. In doing so, it helps bridge the gap between data limitations and real-world clinical needs.
Specific Aims of the Project:
This project aims to identify which cancer patients are most likely to benefit from a treatment, even when patient outcomes are only partially observed. Instead of relying on strict or unrealistic assumptions as in existing methods, our method is designed to work under real-world conditions by providing a range of likely treatment outcomes for different patient subgroups (e.g., bounds on the CATE via partial identification).
Specifically, we propose and evaluate a flexible learning framework that adjusts for incomplete follow-up using minimal assumptions and can be applied with arbitrary machine learning methods and/or general clinical trial data. Using cancer trial data, we aim to demonstrate the full potential of partial identification of the CATE in uncovering patient subgroups that are most likely to benefit from treatment, thereby informing more effective and personalized therapeutic decisions.
Study Design:
Methodological research
What is the purpose of the analysis being proposed? Please select all that apply.:
New research question to examine treatment effectiveness on secondary endpoints and/or within subgroup populations
Confirm or validate previously conducted research on treatment effectiveness
Develop or refine statistical methods
Research on clinical trial methods
Software Used:
Python, R, RStudio, STATA
Data Source and Inclusion/Exclusion Criteria to be used to define the patient sample for your study:
Data source: We will use participant-level data from the trials provided through the YODA Project.
Inclusion criteria: All participants enrolled in the selected clinical trial(s) will be included in the analysis.
Exclusion criteria: Patients with incomplete treatment after assignment will be excluded.
No other datasets will be pooled.
Primary and Secondary Outcome Measure(s) and how they will be categorized/defined for your study:
The primary outcome: (i) overall survival (OS) time and (ii) progression-free survival (PFS) time, along with an event indicator denoting whether progression / death occurred or the data were censored.
The secondary outcome: None.
Main Predictor/Independent Variable and how it will be categorized/defined for your study:
The main independent variable is the treatment group (e.g., Trial NCT02257736: "Placebo + Abiraterone Acetate + Prednisone" vs. "Apalutamide + Abiraterone Acetate + Prednisone"). It will be defined as the treatment indicator in the causal inference setting. This is a binary variable indicating whether a patient received the experimental or control regimen.
In addition, information about demographics such as age, sex/gender, race, ethnicity, but also tumor characteristics, will be considered as covariates and used in the CATE estimation.
Other Variables of Interest that will be used in your analysis and how they will be categorized/defined for your study:
We will consider a range of baseline covariates as additional variables for descriptive and effect modifier analysis.
These include, for example:
- Demographics: Age (continuous), sex (male/female), and race (categorized as White, Black or African American, Asian, and Other).
- Disease characteristics: ECOG performance status (0–1 vs ≥2), number of prior therapies (categorical), cytogenetic risk profile (standard vs high risk), and ISS stage (I, II, III).
- Interventions: "Placebo + Abiraterone Acetate + Prednisone" vs. "Apalutamide + Abiraterone Acetate + Prednisone" (categorized as Control / Treated).
The demographics and disease characteristics will be used for confounder adjustment and to characterize subgroups in which treatment effects may differ.
Statistical Analysis Plan:
Descriptive analysis:
We begin by summarizing key features of the patient population. This includes computing median, mean, minimum, and maximum survival times across clinically relevant subgroups (e.g., age groups, biomarker levels, tumor stages). These statistics help characterize outcome variability and identify broad patterns.
CATE estimation with bounds:
To handle incomplete survival data, we estimate the CATE in addition to the average treatment effect (ATE) to understand treatment effect heterogeneity in the population. We further use partial identification to obtain bounds on the CATE in the presence of censoring. Specifically, our method uses a doubly robust approach and efficient influence function theory to estimate bounds and ensure valid results even when data are partially missing or models are misspecified.
Subgroup evaluation:
We evaluate treatment effects across subgroups defined by clinical characteristics. For each subgroup, we calculate the CATE bounds and identify those with consistently positive lower bounds, indicating benefit from treatment. This helps prioritize subpopulations most likely to respond to an anti-cancer drug.
All analyses will be done in R / Python within a secure environment.
Narrative Summary:
Patients with cancer often respond differently to the same treatment, thus pointing to the potential benefit of personalized care. Clinical trial data can help identify patient groups likely to benefit, but this data is often incomplete due to early treatment stops or missing follow-up. We aim to predict the individual-level effectiveness using state-of-the-art methods that can handle such incomplete data while uncovering meaningful treatment response patterns. By showing which patients benefit more or less, we aim to support evidence generation for personalizing treatment strategies and identifying biomarkers (e.g., tumor characteristics) that predict treatment response. Altogether, our work will contribute to guiding more effective, personalized cancer treatments in real-world settings.
Project Timeline:
Start of project: July 2025
Completion of data cleaning and descriptive analysis: July 2025
Implementation of main methods: Done (code is finished and has been reliably tested on synthetic data)
Evaluation and statistical interpretation: August 2025
Draft manuscript ready: August 2025
First submission for publication: August 2025
Report results back to YODA: September 2025
The entire project is expected to be completed within 3 months, but we ask for access for a 12-month period to accommodate potential updates during the revision process.
Dissemination Plan:
We plan to submit one or two manuscripts to high-impact journals such as Nature Biomedical Engineering, PNAS, or Nature Communications. Our target audience includes researchers in medicine, epidemiology, and oncology, particularly those interested in CATE estimation and cancer trial data applications in survival analysis.
Bibliography:
- Feuerriegel, Stefan, Dennis Frauen, Valentyn Melnychuk, Jonas Schweisthal, Konstantin Hess, Alicia Curth, Stefan Bauer, Niki Kilbertus, Isaac S. Kohane, and Mihaela van der Schaar. Causal machine learning for predicting treatment outcomes. Nature Medicine 30, no. 4: 958-968, 2024.
- John P. Klein and Melvin L. Moeschberger. Survival Analysis: Techniques for censored and truncated data. Springer Science & Business Media, 2006.
- Weijia Zhang, Thuc Duy Le, Lin Liu, Zhi-Hua Zhou, and Jiuyong Li. Mining heterogeneous causal effects for personalized cancer treatment. Bioinformatics, 33(15):2372-2378, 2017.
- Simon Wiegrebe, Philipp Kopper, Raphael Sonabend, Bernd Bischl, and Andreas Bender. Deep learning for survival analysis: a review. Artificial Intelligence Review 57, no. 3: 65, 2024.
- Tulika Rudra Gupta, Daniel Schwartz, Riddhiman Saha, et al. Informative censoring in externally controlled clinical trials: A potential source of bias. ESMO Open, 10(1):104094, 2025.
- Zhiqiang Tan. A distributional approach for causal inference using propensity scores. Journal of the American Statistical Association, 101(476):1619-1637, 2006.
- Alicia Curth, Changhee Lee, and Mihaela van der Schaar. SurvITE: Learning heterogeneous treatment effects from time-to-event data. In NeurIPS, 2021.
- Zijun Gao and Trevor Hastie. Estimating heterogeneous treatment effects for general responses, arXiv preprint, arXiv:2103.04277, 2021.
- Yizhe Xu, Nikolaos Ignatiadis, Erik Sverdrup, Scott Fleming, Stefan Wager, and Nigam Shah. Treatment heterogeneity with survival outcomes, arXiv preprint, arXiv:2207.07758, 2022.
- Shenbo Xu, Raluca Cobzaru, Stan N. Finkelstein, Roy E. Welsch, Kenney Ng, and Zach Shahn. Estimating heterogeneous treatment effects on survival outcomes using counterfactual censoring unbiased transformations, arXiv preprint, arXiv:2401.11263, 2024.