2025-0508 - The YODA Project

                    array(41) {
  ["request_overridden_res"]=>
  string(1) "3"
  ["project_status"]=>
  string(7) "ongoing"
  ["project_assoc_trials"]=>
  array(3) {
    [0]=>
    object(WP_Post)#5596 (24) {
      ["ID"]=>
      int(8008)
      ["post_author"]=>
      string(4) "1363"
      ["post_date"]=>
      string(19) "2023-08-05 04:44:39"
      ["post_date_gmt"]=>
      string(19) "2023-08-05 04:44:39"
      ["post_content"]=>
      string(0) ""
      ["post_title"]=>
      string(210) "NCT02489318 - A Phase 3 Randomized, Placebo-controlled, Double-blind Study of Apalutamide Plus Androgen Deprivation Therapy (ADT) Versus ADT in Subjects With Metastatic Hormone-sensitive Prostate Cancer (mHSPC)"
      ["post_excerpt"]=>
      string(0) ""
      ["post_status"]=>
      string(7) "publish"
      ["comment_status"]=>
      string(6) "closed"
      ["ping_status"]=>
      string(6) "closed"
      ["post_password"]=>
      string(0) ""
      ["post_name"]=>
      string(194) "nct02489318-a-phase-3-randomized-placebo-controlled-double-blind-study-of-apalutamide-plus-androgen-deprivation-therapy-adt-versus-adt-in-subjects-with-metastatic-hormone-sensitive-prostate-canc"
      ["to_ping"]=>
      string(0) ""
      ["pinged"]=>
      string(0) ""
      ["post_modified"]=>
      string(19) "2025-04-30 16:20:21"
      ["post_modified_gmt"]=>
      string(19) "2025-04-30 20:20:21"
      ["post_content_filtered"]=>
      string(0) ""
      ["post_parent"]=>
      int(0)
      ["guid"]=>
      string(243) "https://dev-yoda.pantheonsite.io/clinical-trial/nct02489318-a-phase-3-randomized-placebo-controlled-double-blind-study-of-apalutamide-plus-androgen-deprivation-therapy-adt-versus-adt-in-subjects-with-metastatic-hormone-sensitive-prostate-canc/"
      ["menu_order"]=>
      int(0)
      ["post_type"]=>
      string(14) "clinical_trial"
      ["post_mime_type"]=>
      string(0) ""
      ["comment_count"]=>
      string(1) "0"
      ["filter"]=>
      string(3) "raw"
    }
    [1]=>
    object(WP_Post)#5597 (24) {
      ["ID"]=>
      int(1845)
      ["post_author"]=>
      string(4) "1363"
      ["post_date"]=>
      string(19) "2019-12-12 12:23:00"
      ["post_date_gmt"]=>
      string(19) "2019-12-12 12:23:00"
      ["post_content"]=>
      string(0) ""
      ["post_title"]=>
      string(257) "NCT01715285 - A Randomized, Double-blind, Comparative Study of Abiraterone Acetate Plus Low-Dose Prednisone Plus Androgen Deprivation Therapy (ADT) Versus ADT Alone in Newly Diagnosed Subjects With High-Risk, Metastatic Hormone-naive Prostate Cancer (mHNPC)"
      ["post_excerpt"]=>
      string(0) ""
      ["post_status"]=>
      string(7) "publish"
      ["comment_status"]=>
      string(6) "closed"
      ["ping_status"]=>
      string(6) "closed"
      ["post_password"]=>
      string(0) ""
      ["post_name"]=>
      string(193) "nct01715285-a-randomized-double-blind-comparative-study-of-abiraterone-acetate-plus-low-dose-prednisone-plus-androgen-deprivation-therapy-adt-versus-adt-alone-in-newly-diagnosed-subjects-with-h"
      ["to_ping"]=>
      string(0) ""
      ["pinged"]=>
      string(0) ""
      ["post_modified"]=>
      string(19) "2025-04-30 16:02:44"
      ["post_modified_gmt"]=>
      string(19) "2025-04-30 20:02:44"
      ["post_content_filtered"]=>
      string(0) ""
      ["post_parent"]=>
      int(0)
      ["guid"]=>
      string(242) "https://dev-yoda.pantheonsite.io/clinical-trial/nct01715285-a-randomized-double-blind-comparative-study-of-abiraterone-acetate-plus-low-dose-prednisone-plus-androgen-deprivation-therapy-adt-versus-adt-alone-in-newly-diagnosed-subjects-with-h/"
      ["menu_order"]=>
      int(0)
      ["post_type"]=>
      string(14) "clinical_trial"
      ["post_mime_type"]=>
      string(0) ""
      ["comment_count"]=>
      string(1) "0"
      ["filter"]=>
      string(3) "raw"
    }
    [2]=>
    object(WP_Post)#5598 (24) {
      ["ID"]=>
      int(1810)
      ["post_author"]=>
      string(4) "1363"
      ["post_date"]=>
      string(19) "2019-08-12 15:13:00"
      ["post_date_gmt"]=>
      string(19) "2019-08-12 15:13:00"
      ["post_content"]=>
      string(0) ""
      ["post_title"]=>
      string(171) "NCT02236637 - A Prospective Registry of Patients With a Confirmed Diagnosis of Adenocarcinoma of the Prostate Presenting With Metastatic Castrate-Resistant Prostate Cancer"
      ["post_excerpt"]=>
      string(0) ""
      ["post_status"]=>
      string(7) "publish"
      ["comment_status"]=>
      string(6) "closed"
      ["ping_status"]=>
      string(6) "closed"
      ["post_password"]=>
      string(0) ""
      ["post_name"]=>
      string(169) "nct02236637-a-prospective-registry-of-patients-with-a-confirmed-diagnosis-of-adenocarcinoma-of-the-prostate-presenting-with-metastatic-castrate-resistant-prostate-cancer"
      ["to_ping"]=>
      string(0) ""
      ["pinged"]=>
      string(0) ""
      ["post_modified"]=>
      string(19) "2025-04-30 15:44:32"
      ["post_modified_gmt"]=>
      string(19) "2025-04-30 19:44:32"
      ["post_content_filtered"]=>
      string(0) ""
      ["post_parent"]=>
      int(0)
      ["guid"]=>
      string(218) "https://dev-yoda.pantheonsite.io/clinical-trial/nct02236637-a-prospective-registry-of-patients-with-a-confirmed-diagnosis-of-adenocarcinoma-of-the-prostate-presenting-with-metastatic-castrate-resistant-prostate-cancer/"
      ["menu_order"]=>
      int(0)
      ["post_type"]=>
      string(14) "clinical_trial"
      ["post_mime_type"]=>
      string(0) ""
      ["comment_count"]=>
      string(1) "0"
      ["filter"]=>
      string(3) "raw"
    }
  }
  ["project_title"]=>
  string(131) "The Use of QSAR Machine Learning Methods in Predicting Ultra-Low PSA Responses in Patients treated with Abiraterone and Apalutamide"
  ["project_narrative_summary"]=>
  string(875) "This project, "The Use of QSAR Machine Learning Methods in Predicting Ultra-Low PSA Responses in Patients treated with Abiraterone and Apalutamide," aims to predict ultra-low PSA responses in patients receiving abiraterone and apalutamide.



We use QSAR (Quantitative Structure-Activity Relationship) machine learning methods as powerful classification tools. This approach analyzes comprehensive patient data—like clinical variables and treatment histories—to identify patterns. The goal is to build robust models that can classify patients into those likely to achieve an ultra-low PSA response or not.



These models will provide clinicians an invaluable tool, enabling optimized treatment strategies and personalized patient care. By predicting optimal responders, we can improve clinical decision-making, leading to better patient outcomes."
  ["project_learn_source"]=>
  string(12) "scien_public"
  ["principal_investigator"]=>
  array(7) {
    ["first_name"]=>
    string(13) "Miguel Ángel"
    ["last_name"]=>
    string(12) "Gómez-Luque"
    ["degree"]=>
    string(4) "M.D."
    ["primary_affiliation"]=>
    string(37) "Virgen del Rocío University Hospital"
    ["email"]=>
    string(23) "ma.gomezluque@gmail.com"
    ["state_or_province"]=>
    string(7) "Seville"
    ["country"]=>
    string(5) "Spain"
  }
  ["project_key_personnel"]=>
  bool(false)
  ["project_ext_grants"]=>
  array(2) {
    ["value"]=>
    string(2) "no"
    ["label"]=>
    string(68) "No external grants or funds are being used to support this research."
  }
  ["project_date_type"]=>
  string(18) "full_crs_supp_docs"
  ["property_scientific_abstract"]=>
  string(1278) "Background: Ultra-low PSA levels are vital for prostate cancer prognosis on novel hormonal therapies. Predicting optimal abiraterone/apalutamide responses is a crucial clinical challenge; accurate forecasts enhance treatment and outcomes.



Objective: Develop and validate QSAR-based machine learning models to predict ultra-low PSA responses in prostate cancer patients treated with abiraterone/apalutamide.



Study Design: Retrospective, observational study using existing, de-identified patient data. Employs advanced QSAR-based ML for classification.



Participants: Prostate cancer patients treated with abiraterone, apalutamide, or both, with comprehensive clinical data, treatment histories, and PSA response measurements.



Primary/Secondary Outcome(s): Primary: Predicting ultra-low PSA response (≤ 0.2 ng/mL) post-treatment. Secondary: Identifying key patient/clinical variables; evaluating model performance (accuracy, precision, recall, F1-score, AUC).



Statistical Analysis: ML techniques (e.g., SVM, random forests, neural networks) for model development/validation. Data preprocessing, feature selection, cross-validation for robustness. Performance assessed via standard classification metrics."
  ["project_brief_bg"]=>
  string(3229) "Background: Prostate cancer remains a leading cause of morbidity and mortality worldwide. Novel hormonal therapies, such as abiraterone and apalutamide, have significantly improved outcomes for patients. A critical indicator of favorable prognosis and sustained disease control is the achievement of an ultra-low Prostate-Specific Antigen (PSA) response (typically defined as PSA ≤ 0.2 ng/mL). However, current clinical practice lacks robust, individualized tools to accurately predict which patients will achieve these optimal responses to abiraterone and apalutamide. This uncertainty often leads to empirical treatment decisions, potentially delaying optimal care or exposing patients to unnecessary side effects from less effective therapies. There is a clear unmet need for predictive models that can stratify patients based on their likelihood of achieving this highly desirable outcome.



Statement of Project Significance: This research project holds significant scientific and medical importance by directly addressing the aforementioned clinical challenge. By applying Quantitative Structure-Activity Relationship (QSAR) machine learning methods – adapted here to identify complex patterns within comprehensive patient data – we aim to develop highly accurate predictive models for ultra-low PSA responses.



The information gained from this work will materially enhance generalizable scientific and medical knowledge in several ways:



Novel Predictive Framework: We will establish a novel, data-driven framework for predicting treatment response in prostate cancer, moving beyond traditional statistical approaches. This framework, rooted in QSAR principles for pattern recognition, can be generalized to other disease contexts and drug combinations.



Identification of Key Predictors: The models will not only predict outcomes but also help identify and rank the most influential patient characteristics and clinical variables associated with achieving an ultra-low PSA response. This knowledge can inform future biomarker discovery and deepen our understanding of treatment mechanisms.



Personalized Medicine Advancement: By enabling the early identification of patients most likely to benefit from abiraterone and apalutamide, this work will directly contribute to the advancement of personalized oncology. It will allow clinicians to make more informed, evidence-based decisions, tailoring treatment strategies to individual patient profiles.



This work will inform science and public health by:



Optimizing Clinical Practice: Providing a tool that can guide therapeutic choices, potentially reducing the time to effective treatment and minimizing exposure to ineffective regimens.



Improving Patient Outcomes: Leading to better patient stratification, which can translate into improved quality of life, prolonged disease control, and potentially increased survival rates for prostate cancer patients.



Resource Optimization: Helping healthcare systems allocate resources more efficiently by ensuring that costly therapies are directed towards patients most likely to benefit.

"
  ["project_specific_aims"]=>
  string(1309) "This project aims to leverage advanced machine learning, grounded in QSAR principles, to predict ultra-low PSA responses in prostate cancer patients treated with abiraterone and apalutamide. We hypothesize that patient-specific clinical and treatment data can be used to accurately classify responders.



Aim 1: Develop and validate a robust QSAR-based machine learning model to predict ultra-low PSA response.



Objective: To construct a classification model capable of accurately predicting whether a patient will achieve an ultra-low PSA response (≤ 0.2 ng/mL) after abiraterone and/or apalutamide treatment.



Hypothesis: A machine learning model, trained on diverse patient data, can predict ultra-low PSA responses with high sensitivity and specificity.



Aim 2: Identify key patient and clinical variables driving ultra-low PSA response prediction.



Objective: To determine which specific patient characteristics (e.g., demographics, disease features, prior treatments) are most influential in the predictive model for achieving an ultra-low PSA response.



Hypothesis: A subset of readily available patient and clinical variables will significantly contribute to the model's predictive power for ultra-low PSA responses."
  ["project_study_design"]=>
  array(2) {
    ["value"]=>
    string(5) "other"
    ["label"]=>
    string(5) "Other"
  }
  ["project_study_design_exp"]=>
  string(63) " Observational Study using existing, de-identified patient data"
  ["project_purposes"]=>
  array(5) {
    [0]=>
    array(2) {
      ["value"]=>
      string(76) "confirm_or_validate previously_conducted_research_on_treatment_effectiveness"
      ["label"]=>
      string(76) "Confirm or validate previously conducted research on treatment effectiveness"
    }
    [1]=>
    array(2) {
      ["value"]=>
      string(22) "participant_level_data"
      ["label"]=>
      string(36) "Participant-level data meta-analysis"
    }
    [2]=>
    array(2) {
      ["value"]=>
      string(37) "participant_level_data_only_from_yoda"
      ["label"]=>
      string(51) "Meta-analysis using only data from the YODA Project"
    }
    [3]=>
    array(2) {
      ["value"]=>
      string(37) "develop_or_refine_statistical_methods"
      ["label"]=>
      string(37) "Develop or refine statistical methods"
    }
    [4]=>
    array(2) {
      ["value"]=>
      string(50) "research_on_clinical_prediction_or_risk_prediction"
      ["label"]=>
      string(50) "Research on clinical prediction or risk prediction"
    }
  }
  ["project_research_methods"]=>
  string(1986) "Our primary data source will be an existing, de-identified patient dataset obtained from a well-characterized clinical cohort or a secure research registry. Should data from trials made available through the YODA Project become relevant and accessible, the following explicit inclusion/exclusion criteria would be applied to define our study sample, alongside any other utilized datasets:



Inclusion Criteria:



Histologically confirmed diagnosis of prostate cancer.



Received treatment with abiraterone and/or apalutamide.



Availability of comprehensive baseline clinical and demographic characteristics (e.g., age, race, ECOG performance status, prior therapies, baseline PSA, disease stage, Gleason score).



Availability of serial PSA measurements collected post-treatment, sufficient to assess the primary outcome of ultra-low PSA response (≤ 0.2 ng/mL).



Exclusion Criteria:



Missing critical baseline demographic or clinical data essential for populating model inputs.



Incomplete PSA follow-up that prevents reliable assessment of the primary outcome (e.g., very early treatment discontinuation without adequate follow-up for response determination).



Patients with significant co-morbidities or active second primary malignancies expected to preclude adequate follow-up or impact PSA response assessment unrelated to prostate cancer therapy.



Data quality issues that render key variables unreliable for analysis.



If data from studies other than those potentially accessed via YODA are used (e.g., from collaborating academic institutions), their respective data dictionaries will be rigorously harmonized to ensure consistency across variables. We plan to conduct individual patient data (IPD) analysis by pooling these harmonized datasets on a secure computational platform, enabling a comprehensive and robust analysis."
  ["project_main_outcome_measure"]=>
  string(1980) "Primary Outcome Measure

The primary outcome measure is the prediction of an ultra-low Prostate-Specific Antigen (PSA) response following treatment with abiraterone and/or apalutamide. This is strictly defined as:



Ultra-low PSA Response: A confirmed PSA level of ≤ 0.2 ng/mL achieved at any point after treatment initiation, maintained for at least two consecutive measurements or until progression. Patients not meeting these criteria are classified as "No Ultra-low PSA Response." This binary classification is our machine learning model's direct target.



Secondary Outcome Measures

We'll also evaluate secondary outcome measures for a comprehensive understanding of response and model performance:



Identification of Key Predictive Variables: This involves pinpointing the most influential patient characteristics and clinical variables (e.g., baseline PSA, Gleason score, age) contributing to the model's ability to predict ultra-low PSA response. This will be assessed via feature importance analyses from trained models.



Model Performance Metrics: We'll rigorously quantify our models' predictive power using standard classification metrics:



Accuracy: Proportion of correctly classified instances.



Precision (Positive Predictive Value): Proportion of correctly predicted ultra-low PSA responders among all patients predicted to be responders.



Recall (Sensitivity): Proportion of correctly predicted ultra-low PSA responders among all actual responders.



F1-score: Harmonic mean of precision and recall.



Area Under the Receiver Operating Characteristic Curve (AUC-ROC): Model's ability to distinguish between classes.



Changes to Outcome Measures in Final Analysis

We anticipate no changes to these clearly defined primary and secondary outcome measures in the final analysis reported in our publication. Their definitions are designed for direct comparability and clarity."
  ["project_main_predictor_indep"]=>
  string(1599) "In this study, our main independent variables are a comprehensive set of patient-specific clinical and demographic characteristics, alongside treatment-related factors. These serve as inputs for our QSAR-based machine learning models, which aim to predict the ultra-low PSA response. We are assessing the collective predictive power of these diverse features, rather than a single variable.



Variables will be extracted from the patient dataset and defined for consistency and comparability:



Patient Demographic and Baseline Clinical Characteristics

Age at Treatment Initiation: Continuous (years).



Race/Ethnicity: Categorical (e.g., White, Black, Asian; based on data).



ECOG Performance Status (PS): Categorical (e.g., 0, 1, 2).



Baseline PSA Level: Continuous (ng/mL), pre-treatment.



Gleason Score at Diagnosis: Categorical (e.g., ≤ 6, 7 (3+4), 7 (4+3), ≥ 8).



Disease Stage: Categorical (e.g., localized, metastatic; per common staging systems).



Prior Treatments: Binary/Categorical for prior therapies (e.g., chemotherapy, ADT, radiation).



Prior Antiandrogen Use: Binary (Yes/No).



Lactate Dehydrogenase (LDH) Level: Continuous (if available).



Alkaline Phosphatase (ALP) Level: Continuous (if available).



Treatment-Related Factors

Drug Received: Categorical (Abiraterone, Apalutamide, Both/Sequential).



Treatment Duration: Continuous (months), until response assessment/discontinuation."
  ["project_other_variables_interest"]=>
  string(1957) "Beyond the primary predictors, several other variables will be used in our analysis to thoroughly characterize the study sample and for potential multivariable risk adjustment or deeper exploratory analysis. These variables, while not direct inputs for the primary predictive task, offer valuable context and allow for subgroup analyses or sensitivity testing.



Disease Progression and Survival Outcomes

Clinical Progression: Binary variable (Yes/No), indicating progression based on RECIST criteria (for measurable disease), new bone lesions on imaging, or significant clinical worsening. Time to clinical progression (continuous, in months) will also be recorded.



PSA Progression: Binary variable (Yes/No), indicating PSA progression as per PCWG3 criteria (e.g., ≥ 25% increase and ≥ 2 ng/mL absolute increase over nadir or baseline, confirmed). Time to PSA progression (continuous, in months) will be recorded.



Overall Survival (OS): Binary variable (Yes/No, indicating death from any cause) and continuous variable (in months from treatment initiation to death or last follow-up).



Progression-Free Survival (PFS): Continuous variable (in months from treatment initiation to progression [clinical or PSA, whichever comes first] or death from any cause).



Concomitant Medications

Steroid Use (e.g., Prednisone): Binary variable (Yes/No), indicating concomitant use during abiraterone treatment.



Bone-Targeting Agents: Binary variable (Yes/No), indicating concomitant use of agents like denosumab or zoledronic acid.



Laboratory Parameters (Longitudinal or at Specific Time Points)

Hemoglobin: Continuous (g/dL).



Creatinine: Continuous (mg/dL or µmol/L).



Liver Enzymes (ALT, AST): Continuous (U/L).



Testosterone Level (during treatment): Continuous (ng/dL or nmol/L).



"
  ["project_stat_analysis_plan"]=>
  string(3773) "Our statistical analysis plan integrates comprehensive traditional methods with advanced machine learning techniques to thoroughly analyze the patient data and achieve the study objectives.



1. Data Preprocessing and Management

Initial steps will involve rigorous data cleaning, including identification and handling of outliers and inconsistencies. Missing data will be addressed using appropriate imputation strategies (e.g., multiple imputation, k-Nearest Neighbors), with sensitivity analyses performed to evaluate the impact of chosen methods. Variables will undergo necessary feature engineering and transformations (e.g., log-transformations for skewed distributions, one-hot encoding for categorical variables) to optimize model input. The dataset will be randomly partitioned into training, validation (for hyperparameter tuning), and independent test sets to ensure robust and unbiased model evaluation.



2. Descriptive Analyses

We will provide a detailed summary of the study cohort's baseline demographic, clinical, and treatment characteristics. Continuous variables will be reported as mean (standard deviation) or median (interquartile range), as appropriate. Categorical variables will be presented as frequencies and percentages. The overall proportion of patients achieving an ultra-low PSA response within the cohort will be explicitly characterized.



3. Bivariate Analyses

Associations between individual independent variables and the primary outcome (ultra-low PSA response) will be assessed. For categorical predictors, we will employ Chi-square tests or Fisher's exact tests. For continuous predictors, independent t-tests or Mann-Whitney U tests will be utilized. Correlation analyses (e.g., Pearson, Spearman) will examine relationships between continuous variables.



4. Advanced Analyses: QSAR-Based Machine Learning for Prediction (Main Focus)

The core of our analysis involves developing and validating predictive machine learning models for ultra-low PSA response:



Model Selection: We will explore and compare several supervised machine learning classification algorithms adapted for QSAR principles in patient data. Candidate models include Support Vector Machines (SVMs), Random Forests, Gradient Boosting Machines (e.g., XGBoost, LightGBM), and Neural Networks. Logistic Regression will serve as a baseline comparator.



Feature Selection: Techniques such as Recursive Feature Elimination (RFE) or tree-based feature importance methods will be applied to identify the most relevant predictors, reduce dimensionality, and enhance model interpretability and generalization.



Model Training and Hyperparameter Tuning: Models will be trained on the designated training set. Optimal model hyperparameters will be identified through systematic cross-validation (e.g., 5-fold or 10-fold) using methods like Grid Search or Bayesian Optimization on the training/validation sets.



Model Evaluation: Performance will be rigorously assessed on the independent test set using the predefined primary and secondary outcome metrics. Key metrics include Accuracy, Precision, Recall, F1-score, and Area Under the Receiver Operating Characteristic Curve (AUC-ROC). Calibration plots will assess agreement between predicted probabilities and observed outcomes. Confusion matrices will provide detailed insight into classification errors.



Model Interpretability: To understand the contribution of individual features to model predictions, especially for complex "black-box" models, methods like SHAP (SHapley Additive exPlanations) or LIME (Local Interpretable Model-agnostic Explanations) will be employed."
  ["project_software_used"]=>
  array(2) {
    [0]=>
    array(2) {
      ["value"]=>
      string(6) "python"
      ["label"]=>
      string(6) "Python"
    }
    [1]=>
    array(2) {
      ["value"]=>
      string(7) "rstudio"
      ["label"]=>
      string(7) "RStudio"
    }
  }
  ["project_timeline"]=>
  string(1651) "Here's an estimated timeline for the key milestones of this study. This plan assumes data access is granted promptly and considers the 12-month access period, with potential for extension.



Anticipated Project Start Date:



October 1, 2025 (Upon Data Access Approval)



Data Processing & Initial Analysis Completion:



February 28, 2026 (Approximately 5 months from start)



This phase includes data cleaning, preprocessing, imputation, and preliminary descriptive/bivariate analyses.



Machine Learning Model Development & Validation Completion:



June 30, 2026 (Approximately 9 months from start)



This covers feature engineering, model selection, training, hyperparameter tuning, rigorous validation on independent test sets, and interpretability analysis.



Manuscript Drafted and First Submitted for Publication:



September 30, 2026 (Approximately 12 months from start)



This includes full results interpretation, writing the manuscript, internal reviews, and preparing for journal submission.



Date Results Reported Back to the YODA Project:



October 31, 2026 (Within 13 months of start)



A summary of key findings and the final manuscript will be provided as required by the Data Use Agreement.



This timeline is ambitious but achievable, leveraging existing methodologies and focusing on rapid execution once data access is secured. Should an extension be needed, it would primarily affect the publication and final reporting dates."
  ["project_dissemination_plan"]=>
  string(1601) "Our comprehensive dissemination plan aims to share the study's findings with both scientific and clinical communities, ensuring maximum impact and contribution to medical knowledge and public health.



Anticipated Products

The primary product of this research will be a peer-reviewed scientific manuscript. This manuscript will detail the project's methodology, the performance of the developed QSAR-based machine learning models, the identified key predictive variables for ultra-low PSA response, and the clinical implications of our findings. We may also explore presenting selected results at major international scientific conferences focused on oncology, urology, or machine learning in healthcare. Depending on the depth and breadth of the findings, secondary products could include supplementary methodological reports or data visualization tools that further explain the models.



Target Audiences

Our research targets several key audiences:



Oncologists and Urologists: Physicians specializing in prostate cancer treatment, who will benefit from predictive tools to optimize patient selection for abiraterone and apalutamide.



Medical Researchers: Scientists involved in prostate cancer biology, drug development, and precision medicine, who can build upon our findings regarding predictive biomarkers and machine learning applications.



Methodologists and Data Scientists: Researchers interested in the application of QSAR principles and advanced machine learning techniques in clinical data.



"
  ["project_bibliography"]=>
  string(1900) "
1. Luque Ruiz, I., & Gómez-Nieto, M. Á. (2018). QSAR classification and regression models for β-secretase inhibitors using relative distance matrices. SAR and QSAR in environmental research, 29(5), 355–383. https://doi.org/10.1080/1062936X.2018.1442879


2. Luque Ruiz, I., & Gómez-Nieto, M. Á. (2018). Study of Data Set Modelability: Modelability, Rivality, and Weighted Modelability Indexes. Journal of chemical information and modeling, 58(9), 1798–1814. https://doi.org/10.1021/acs.jcim.8b00188

3. Luque Ruiz, I., & Gómez-Nieto, M. Á. (2018). Regression Modelability Index: A New Index for Prediction of the Modelability of Data Sets in the Development of QSAR Regression Models. Journal of chemical information and modeling, 58(10), 2069–2084. https://doi.org/10.1021/acs.jcim.8b00313

4. Wang, T., Yuan, X. S., Wu, M. B., Lin, J. P., & Yang, L. R. (2017). The advancement of multidimensional QSAR for novel drug discovery – where are we headed?. Expert opinion on drug discovery, 12(8), 769–784. https://doi.org/10.1080/17460441.2017.1336157



5. Lee, C., Light, A., Alaa, A., Thurtle, D., van der Schaar, M., & Gnanapragasam, V. J. (2021). Application of a novel machine learning framework for predicting non-metastatic prostate cancer-specific mortality in men using the Surveillance, Epidemiology, and End Results (SEER) database. The Lancet. Digital health, 3(3), e158–e165. https://doi.org/10.1016/S2589-7500(20)30314-9








"
  ["project_suppl_material"]=>
  bool(false)
  ["project_coi"]=>
  array(1) {
    [0]=>
    array(1) {
      ["file_coi"]=>
      array(21) {
        ["ID"]=>
        int(17602)
        ["id"]=>
        int(17602)
        ["title"]=>
        string(9) "Apply.pdf"
        ["filename"]=>
        string(9) "Apply.pdf"
        ["filesize"]=>
        int(20943)
        ["url"]=>
        string(58) "https://yoda.yale.edu/wp-content/uploads/2025/07/Apply.pdf"
        ["link"]=>
        string(55) "https://yoda.yale.edu/data-request/2025-0508/apply-pdf/"
        ["alt"]=>
        string(0) ""
        ["author"]=>
        string(4) "2156"
        ["description"]=>
        string(0) ""
        ["caption"]=>
        string(0) ""
        ["name"]=>
        string(9) "apply-pdf"
        ["status"]=>
        string(7) "inherit"
        ["uploaded_to"]=>
        int(17601)
        ["date"]=>
        string(19) "2025-07-14 18:20:29"
        ["modified"]=>
        string(19) "2025-07-14 18:20:31"
        ["menu_order"]=>
        int(0)
        ["mime_type"]=>
        string(15) "application/pdf"
        ["type"]=>
        string(11) "application"
        ["subtype"]=>
        string(3) "pdf"
        ["icon"]=>
        string(62) "https://yoda.yale.edu/wp/wp-includes/images/media/document.png"
      }
    }
  }
  ["data_use_agreement_training"]=>
  bool(true)
  ["human_research_protection_training"]=>
  bool(true)
  ["certification"]=>
  bool(true)
  ["search_order"]=>
  string(1) "0"
  ["project_send_email_updates"]=>
  bool(false)
  ["project_publ_available"]=>
  bool(true)
  ["project_year_access"]=>
  string(0) ""
  ["project_rep_publ"]=>
  bool(false)
  ["project_assoc_data"]=>
  array(0) {
  }
  ["project_due_dil_assessment"]=>
  array(21) {
    ["ID"]=>
    int(17899)
    ["id"]=>
    int(17899)
    ["title"]=>
    string(47) "YODA Project Due Diligence Assessment 2025-0508"
    ["filename"]=>
    string(51) "YODA-Project-Due-Diligence-Assessment-2025-0508.pdf"
    ["filesize"]=>
    int(128667)
    ["url"]=>
    string(100) "https://yoda.yale.edu/wp-content/uploads/2025/07/YODA-Project-Due-Diligence-Assessment-2025-0508.pdf"
    ["link"]=>
    string(93) "https://yoda.yale.edu/data-request/2025-0508/yoda-project-due-diligence-assessment-2025-0508/"
    ["alt"]=>
    string(0) ""
    ["author"]=>
    string(4) "1885"
    ["description"]=>
    string(0) ""
    ["caption"]=>
    string(0) ""
    ["name"]=>
    string(47) "yoda-project-due-diligence-assessment-2025-0508"
    ["status"]=>
    string(7) "inherit"
    ["uploaded_to"]=>
    int(17601)
    ["date"]=>
    string(19) "2025-09-15 15:24:01"
    ["modified"]=>
    string(19) "2025-09-15 15:24:01"
    ["menu_order"]=>
    int(0)
    ["mime_type"]=>
    string(15) "application/pdf"
    ["type"]=>
    string(11) "application"
    ["subtype"]=>
    string(3) "pdf"
    ["icon"]=>
    string(62) "https://yoda.yale.edu/wp/wp-includes/images/media/document.png"
  }
  ["project_title_link"]=>
  array(21) {
    ["ID"]=>
    int(17901)
    ["id"]=>
    int(17901)
    ["title"]=>
    string(46) "YODA Project Protocol - 2025-0508 - 2025-07-14"
    ["filename"]=>
    string(46) "YODA-Project-Protocol-2025-0508-2025-07-14.pdf"
    ["filesize"]=>
    int(136841)
    ["url"]=>
    string(95) "https://yoda.yale.edu/wp-content/uploads/2025/07/YODA-Project-Protocol-2025-0508-2025-07-14.pdf"
    ["link"]=>
    string(88) "https://yoda.yale.edu/data-request/2025-0508/yoda-project-protocol-2025-0508-2025-07-14/"
    ["alt"]=>
    string(0) ""
    ["author"]=>
    string(4) "1885"
    ["description"]=>
    string(0) ""
    ["caption"]=>
    string(0) ""
    ["name"]=>
    string(42) "yoda-project-protocol-2025-0508-2025-07-14"
    ["status"]=>
    string(7) "inherit"
    ["uploaded_to"]=>
    int(17601)
    ["date"]=>
    string(19) "2025-09-15 15:28:06"
    ["modified"]=>
    string(19) "2025-09-15 15:28:06"
    ["menu_order"]=>
    int(0)
    ["mime_type"]=>
    string(15) "application/pdf"
    ["type"]=>
    string(11) "application"
    ["subtype"]=>
    string(3) "pdf"
    ["icon"]=>
    string(62) "https://yoda.yale.edu/wp/wp-includes/images/media/document.png"
  }
  ["project_review_link"]=>
  array(21) {
    ["ID"]=>
    int(17900)
    ["id"]=>
    int(17900)
    ["title"]=>
    string(36) "YODA Project Review - 2025-0508_site"
    ["filename"]=>
    string(38) "YODA-Project-Review-2025-0508_site.pdf"
    ["filesize"]=>
    int(1315618)
    ["url"]=>
    string(87) "https://yoda.yale.edu/wp-content/uploads/2025/07/YODA-Project-Review-2025-0508_site.pdf"
    ["link"]=>
    string(80) "https://yoda.yale.edu/data-request/2025-0508/yoda-project-review-2025-0508_site/"
    ["alt"]=>
    string(0) ""
    ["author"]=>
    string(4) "1885"
    ["description"]=>
    string(0) ""
    ["caption"]=>
    string(0) ""
    ["name"]=>
    string(34) "yoda-project-review-2025-0508_site"
    ["status"]=>
    string(7) "inherit"
    ["uploaded_to"]=>
    int(17601)
    ["date"]=>
    string(19) "2025-09-15 15:26:53"
    ["modified"]=>
    string(19) "2025-09-15 15:26:53"
    ["menu_order"]=>
    int(0)
    ["mime_type"]=>
    string(15) "application/pdf"
    ["type"]=>
    string(11) "application"
    ["subtype"]=>
    string(3) "pdf"
    ["icon"]=>
    string(62) "https://yoda.yale.edu/wp/wp-includes/images/media/document.png"
  }
  ["project_highlight_button"]=>
  string(0) ""
  ["request_data_partner"]=>
  string(0) ""
}
data partner
array(1) {
  [0]=>
  string(0) ""
}


pi country
array(0) {
}


pi affil
array(0) {
}


products
array(0) {
}


num of trials
array(1) {
  [0]=>
  string(1) "0"
}


res
array(1) {
  [0]=>
  string(1) "3"
}

General Information

How did you learn about the YODA Project?: Scientific Publication

Conflict of Interest

Apply.pdf

Request Clinical Trials

Associated Trial(s):

What type of data are you looking for?: Individual Participant-Level Data, which includes Full CSR and all supporting documentation

Request Clinical Trials

Data Request Status

Status: Ongoing

Research Proposal

Project Title: The Use of QSAR Machine Learning Methods in Predicting Ultra-Low PSA Responses in Patients treated with Abiraterone and Apalutamide

Scientific Abstract: Background: Ultra-low PSA levels are vital for prostate cancer prognosis on novel hormonal therapies. Predicting optimal abiraterone/apalutamide responses is a crucial clinical challenge; accurate forecasts enhance treatment and outcomes.

Objective: Develop and validate QSAR-based machine learning models to predict ultra-low PSA responses in prostate cancer patients treated with abiraterone/apalutamide.

Study Design: Retrospective, observational study using existing, de-identified patient data. Employs advanced QSAR-based ML for classification.

Participants: Prostate cancer patients treated with abiraterone, apalutamide, or both, with comprehensive clinical data, treatment histories, and PSA response measurements.

Primary/Secondary Outcome(s): Primary: Predicting ultra-low PSA response (<= 0.2 ng/mL) post-treatment. Secondary: Identifying key patient/clinical variables; evaluating model performance (accuracy, precision, recall, F1-score, AUC).

Statistical Analysis: ML techniques (e.g., SVM, random forests, neural networks) for model development/validation. Data preprocessing, feature selection, cross-validation for robustness. Performance assessed via standard classification metrics.

Brief Project Background and Statement of Project Significance: Background: Prostate cancer remains a leading cause of morbidity and mortality worldwide. Novel hormonal therapies, such as abiraterone and apalutamide, have significantly improved outcomes for patients. A critical indicator of favorable prognosis and sustained disease control is the achievement of an ultra-low Prostate-Specific Antigen (PSA) response (typically defined as PSA <= 0.2 ng/mL). However, current clinical practice lacks robust, individualized tools to accurately predict which patients will achieve these optimal responses to abiraterone and apalutamide. This uncertainty often leads to empirical treatment decisions, potentially delaying optimal care or exposing patients to unnecessary side effects from less effective therapies. There is a clear unmet need for predictive models that can stratify patients based on their likelihood of achieving this highly desirable outcome.

Statement of Project Significance: This research project holds significant scientific and medical importance by directly addressing the aforementioned clinical challenge. By applying Quantitative Structure-Activity Relationship (QSAR) machine learning methods -- adapted here to identify complex patterns within comprehensive patient data -- we aim to develop highly accurate predictive models for ultra-low PSA responses.

The information gained from this work will materially enhance generalizable scientific and medical knowledge in several ways:

Novel Predictive Framework: We will establish a novel, data-driven framework for predicting treatment response in prostate cancer, moving beyond traditional statistical approaches. This framework, rooted in QSAR principles for pattern recognition, can be generalized to other disease contexts and drug combinations.

Identification of Key Predictors: The models will not only predict outcomes but also help identify and rank the most influential patient characteristics and clinical variables associated with achieving an ultra-low PSA response. This knowledge can inform future biomarker discovery and deepen our understanding of treatment mechanisms.

Personalized Medicine Advancement: By enabling the early identification of patients most likely to benefit from abiraterone and apalutamide, this work will directly contribute to the advancement of personalized oncology. It will allow clinicians to make more informed, evidence-based decisions, tailoring treatment strategies to individual patient profiles.

This work will inform science and public health by:

Optimizing Clinical Practice: Providing a tool that can guide therapeutic choices, potentially reducing the time to effective treatment and minimizing exposure to ineffective regimens.

Improving Patient Outcomes: Leading to better patient stratification, which can translate into improved quality of life, prolonged disease control, and potentially increased survival rates for prostate cancer patients.

Resource Optimization: Helping healthcare systems allocate resources more efficiently by ensuring that costly therapies are directed towards patients most likely to benefit.

Specific Aims of the Project: This project aims to leverage advanced machine learning, grounded in QSAR principles, to predict ultra-low PSA responses in prostate cancer patients treated with abiraterone and apalutamide. We hypothesize that patient-specific clinical and treatment data can be used to accurately classify responders.

Aim 1: Develop and validate a robust QSAR-based machine learning model to predict ultra-low PSA response.

Objective: To construct a classification model capable of accurately predicting whether a patient will achieve an ultra-low PSA response (<= 0.2 ng/mL) after abiraterone and/or apalutamide treatment.

Hypothesis: A machine learning model, trained on diverse patient data, can predict ultra-low PSA responses with high sensitivity and specificity.

Aim 2: Identify key patient and clinical variables driving ultra-low PSA response prediction.

Objective: To determine which specific patient characteristics (e.g., demographics, disease features, prior treatments) are most influential in the predictive model for achieving an ultra-low PSA response.

Hypothesis: A subset of readily available patient and clinical variables will significantly contribute to the model's predictive power for ultra-low PSA responses.

Study Design: Other
Explain: Observational Study using existing, de-identified patient data

What is the purpose of the analysis being proposed? Please select all that apply.: Confirm or validate previously conducted research on treatment effectiveness; Participant-level data meta-analysis; Meta-analysis using only data from the YODA Project; Develop or refine statistical methods; Research on clinical prediction or risk prediction

Software Used: Python, RStudio

Data Source and Inclusion/Exclusion Criteria to be used to define the patient sample for your study: Our primary data source will be an existing, de-identified patient dataset obtained from a well-characterized clinical cohort or a secure research registry. Should data from trials made available through the YODA Project become relevant and accessible, the following explicit inclusion/exclusion criteria would be applied to define our study sample, alongside any other utilized datasets:

Inclusion Criteria:

Histologically confirmed diagnosis of prostate cancer.

Received treatment with abiraterone and/or apalutamide.

Availability of comprehensive baseline clinical and demographic characteristics (e.g., age, race, ECOG performance status, prior therapies, baseline PSA, disease stage, Gleason score).

Availability of serial PSA measurements collected post-treatment, sufficient to assess the primary outcome of ultra-low PSA response (<= 0.2 ng/mL).

Exclusion Criteria:

Missing critical baseline demographic or clinical data essential for populating model inputs.

Incomplete PSA follow-up that prevents reliable assessment of the primary outcome (e.g., very early treatment discontinuation without adequate follow-up for response determination).

Patients with significant co-morbidities or active second primary malignancies expected to preclude adequate follow-up or impact PSA response assessment unrelated to prostate cancer therapy.

Data quality issues that render key variables unreliable for analysis.

If data from studies other than those potentially accessed via YODA are used (e.g., from collaborating academic institutions), their respective data dictionaries will be rigorously harmonized to ensure consistency across variables. We plan to conduct individual patient data (IPD) analysis by pooling these harmonized datasets on a secure computational platform, enabling a comprehensive and robust analysis.

Primary and Secondary Outcome Measure(s) and how they will be categorized/defined for your study: Primary Outcome Measure
The primary outcome measure is the prediction of an ultra-low Prostate-Specific Antigen (PSA) response following treatment with abiraterone and/or apalutamide. This is strictly defined as:

Ultra-low PSA Response: A confirmed PSA level of <= 0.2 ng/mL achieved at any point after treatment initiation, maintained for at least two consecutive measurements or until progression. Patients not meeting these criteria are classified as "No Ultra-low PSA Response." This binary classification is our machine learning model's direct target.

Secondary Outcome Measures
We'll also evaluate secondary outcome measures for a comprehensive understanding of response and model performance:

Identification of Key Predictive Variables: This involves pinpointing the most influential patient characteristics and clinical variables (e.g., baseline PSA, Gleason score, age) contributing to the model's ability to predict ultra-low PSA response. This will be assessed via feature importance analyses from trained models.

Model Performance Metrics: We'll rigorously quantify our models' predictive power using standard classification metrics:

Accuracy: Proportion of correctly classified instances.

Precision (Positive Predictive Value): Proportion of true ultra-low PSA responders among all patients predicted to be responders.

Recall (Sensitivity): Proportion of actual ultra-low PSA responders that the model correctly identifies.

F1-score: Harmonic mean of precision and recall.

Area Under the Receiver Operating Characteristic Curve (AUC-ROC): Model's ability to distinguish between classes.

Changes to Outcome Measures in Final Analysis
We anticipate no changes to these clearly defined primary and secondary outcome measures in the final analysis reported in our publication. Their definitions are designed for direct comparability and clarity.

Main Predictor/Independent Variable and how it will be categorized/defined for your study: In this study, our main independent variables are a comprehensive set of patient-specific clinical and demographic characteristics, alongside treatment-related factors. These serve as inputs for our QSAR-based machine learning models, which aim to predict the ultra-low PSA response. We are assessing the collective predictive power of these diverse features, rather than a single variable.

Variables will be extracted from the patient dataset and defined for consistency and comparability:

Patient Demographic and Baseline Clinical Characteristics
Age at Treatment Initiation: Continuous (years).

Race/Ethnicity: Categorical (e.g., White, Black, Asian; based on data).

ECOG Performance Status (PS): Categorical (e.g., 0, 1, 2).

Baseline PSA Level: Continuous (ng/mL), pre-treatment.

Gleason Score at Diagnosis: Categorical (e.g., <= 6, 7 (3+4), 7 (4+3), >= 8).

Disease Stage: Categorical (e.g., localized, metastatic; per common staging systems).

Prior Treatments: Binary/Categorical for prior therapies (e.g., chemotherapy, ADT, radiation).

Prior Antiandrogen Use: Binary (Yes/No).

Lactate Dehydrogenase (LDH) Level: Continuous (if available).

Alkaline Phosphatase (ALP) Level: Continuous (if available).

Treatment-Related Factors
Drug Received: Categorical (Abiraterone, Apalutamide, Both/Sequential).

Treatment Duration: Continuous (months), until response assessment/discontinuation.

Other Variables of Interest that will be used in your analysis and how they will be categorized/defined for your study: Beyond the primary predictors, several other variables will be used in our analysis to thoroughly characterize the study sample and for potential multivariable risk adjustment or deeper exploratory analysis. These variables, while not direct inputs for the primary predictive task, offer valuable context and allow for subgroup analyses or sensitivity testing.

Disease Progression and Survival Outcomes
Clinical Progression: Binary variable (Yes/No), indicating progression based on RECIST criteria (for measurable disease), new bone lesions on imaging, or significant clinical worsening. Time to clinical progression (continuous, in months) will also be recorded.

PSA Progression: Binary variable (Yes/No), indicating PSA progression as per PCWG3 criteria (e.g., >= 25% increase and >= 2 ng/mL absolute increase over nadir or baseline, confirmed). Time to PSA progression (continuous, in months) will be recorded.

Overall Survival (OS): Binary variable (Yes/No, indicating death from any cause) and continuous variable (in months from treatment initiation to death or last follow-up).

Progression-Free Survival (PFS): Continuous variable (in months from treatment initiation to progression [clinical or PSA, whichever comes first] or death from any cause).

Concomitant Medications
Steroid Use (e.g., Prednisone): Binary variable (Yes/No), indicating concomitant use during abiraterone treatment.

Bone-Targeting Agents: Binary variable (Yes/No), indicating concomitant use of agents like denosumab or zoledronic acid.

Laboratory Parameters (Longitudinal or at Specific Time Points)
Hemoglobin: Continuous (g/dL).

Creatinine: Continuous (mg/dL or umol/L).

Liver Enzymes (ALT, AST): Continuous (U/L).

Testosterone Level (during treatment): Continuous (ng/dL or nmol/L).

Statistical Analysis Plan: Our statistical analysis plan integrates comprehensive traditional methods with advanced machine learning techniques to thoroughly analyze the patient data and achieve the study objectives.

1. Data Preprocessing and Management
Initial steps will involve rigorous data cleaning, including identification and handling of outliers and inconsistencies. Missing data will be addressed using appropriate imputation strategies (e.g., multiple imputation, k-Nearest Neighbors), with sensitivity analyses performed to evaluate the impact of chosen methods. Variables will undergo necessary feature engineering and transformations (e.g., log-transformations for skewed distributions, one-hot encoding for categorical variables) to optimize model input. The dataset will be randomly partitioned into training, validation (for hyperparameter tuning), and independent test sets to ensure robust and unbiased model evaluation.

2. Descriptive Analyses
We will provide a detailed summary of the study cohort's baseline demographic, clinical, and treatment characteristics. Continuous variables will be reported as mean (standard deviation) or median (interquartile range), as appropriate. Categorical variables will be presented as frequencies and percentages. The overall proportion of patients achieving an ultra-low PSA response within the cohort will be explicitly characterized.

3. Bivariate Analyses
Associations between individual independent variables and the primary outcome (ultra-low PSA response) will be assessed. For categorical predictors, we will employ Chi-square tests or Fisher's exact tests. For continuous predictors, independent t-tests or Mann-Whitney U tests will be utilized. Correlation analyses (e.g., Pearson, Spearman) will examine relationships between continuous variables.

4. Advanced Analyses: QSAR-Based Machine Learning for Prediction (Main Focus)
The core of our analysis involves developing and validating predictive machine learning models for ultra-low PSA response:

Model Selection: We will explore and compare several supervised machine learning classification algorithms adapted for QSAR principles in patient data. Candidate models include Support Vector Machines (SVMs), Random Forests, Gradient Boosting Machines (e.g., XGBoost, LightGBM), and Neural Networks. Logistic Regression will serve as a baseline comparator.

Feature Selection: Techniques such as Recursive Feature Elimination (RFE) or tree-based feature importance methods will be applied to identify the most relevant predictors, reduce dimensionality, and enhance model interpretability and generalization.

Model Training and Hyperparameter Tuning: Models will be trained on the designated training set. Optimal model hyperparameters will be identified through systematic cross-validation (e.g., 5-fold or 10-fold) using methods like Grid Search or Bayesian Optimization on the training/validation sets.

Model Evaluation: Performance will be rigorously assessed on the independent test set using the predefined primary and secondary outcome metrics. Key metrics include Accuracy, Precision, Recall, F1-score, and Area Under the Receiver Operating Characteristic Curve (AUC-ROC). Calibration plots will assess agreement between predicted probabilities and observed outcomes. Confusion matrices will provide detailed insight into classification errors.

Model Interpretability: To understand the contribution of individual features to model predictions, especially for complex "black-box" models, methods like SHAP (SHapley Additive exPlanations) or LIME (Local Interpretable Model-agnostic Explanations) will be employed.

Narrative Summary: This project, "The Use of QSAR Machine Learning Methods in Predicting Ultra-Low PSA Responses in Patients treated with Abiraterone and Apalutamide," aims to predict ultra-low PSA responses in patients receiving abiraterone and apalutamide.

We use QSAR (Quantitative Structure-Activity Relationship) machine learning methods as powerful classification tools. This approach analyzes comprehensive patient data--like clinical variables and treatment histories--to identify patterns. The goal is to build robust models that can classify patients into those likely to achieve an ultra-low PSA response or not.

These models will provide clinicians an invaluable tool, enabling optimized treatment strategies and personalized patient care. By predicting optimal responders, we can improve clinical decision-making, leading to better patient outcomes.

Project Timeline: Here's an estimated timeline for the key milestones of this study. This plan assumes data access is granted promptly and considers the 12-month access period, with potential for extension.

Anticipated Project Start Date:

October 1, 2025 (Upon Data Access Approval)

Data Processing & Initial Analysis Completion:

February 28, 2026 (Approximately 5 months from start)

This phase includes data cleaning, preprocessing, imputation, and preliminary descriptive/bivariate analyses.

Machine Learning Model Development & Validation Completion:

June 30, 2026 (Approximately 9 months from start)

This covers feature engineering, model selection, training, hyperparameter tuning, rigorous validation on independent test sets, and interpretability analysis.

Manuscript Drafted and First Submitted for Publication:

September 30, 2026 (Approximately 12 months from start)

This includes full results interpretation, writing the manuscript, internal reviews, and preparing for journal submission.

Date Results Reported Back to the YODA Project:

October 31, 2026 (Within 13 months of start)

A summary of key findings and the final manuscript will be provided as required by the Data Use Agreement.

This timeline is ambitious but achievable, leveraging existing methodologies and focusing on rapid execution once data access is secured. Should an extension be needed, it would primarily affect the publication and final reporting dates.

Dissemination Plan: Our comprehensive dissemination plan aims to share the study's findings with both scientific and clinical communities, ensuring maximum impact and contribution to medical knowledge and public health.

Anticipated Products
The primary product of this research will be a peer-reviewed scientific manuscript. This manuscript will detail the project's methodology, the performance of the developed QSAR-based machine learning models, the identified key predictive variables for ultra-low PSA response, and the clinical implications of our findings. We may also explore presenting selected results at major international scientific conferences focused on oncology, urology, or machine learning in healthcare. Depending on the depth and breadth of the findings, secondary products could include supplementary methodological reports or data visualization tools that further explain the models.

Target Audiences
Our research targets several key audiences:

Oncologists and Urologists: Physicians specializing in prostate cancer treatment, who will benefit from predictive tools to optimize patient selection for abiraterone and apalutamide.

Medical Researchers: Scientists involved in prostate cancer biology, drug development, and precision medicine, who can build upon our findings regarding predictive biomarkers and machine learning applications.

Methodologists and Data Scientists: Researchers interested in the application of QSAR principles and advanced machine learning techniques in clinical data.

Bibliography:

1. Luque Ruiz, I., & Gómez-Nieto, M. Á. (2018). QSAR classification and regression models for β-secretase inhibitors using relative distance matrices. SAR and QSAR in environmental research, 29(5), 355--383. https://doi.org/10.1080/1062936X.2018.1442879

2. Luque Ruiz, I., & Gómez-Nieto, M. Á. (2018). Study of Data Set Modelability: Modelability, Rivality, and Weighted Modelability Indexes. Journal of chemical information and modeling, 58(9), 1798--1814. https://doi.org/10.1021/acs.jcim.8b00188

3. Luque Ruiz, I., & Gómez-Nieto, M. Á. (2018). Regression Modelability Index: A New Index for Prediction of the Modelability of Data Sets in the Development of QSAR Regression Models. Journal of chemical information and modeling, 58(10), 2069--2084. https://doi.org/10.1021/acs.jcim.8b00313

4. Wang, T., Yuan, X. S., Wu, M. B., Lin, J. P., & Yang, L. R. (2017). The advancement of multidimensional QSAR for novel drug discovery – where are we headed?. Expert opinion on drug discovery, 12(8), 769--784. https://doi.org/10.1080/17460441.2017.1336157

5. Lee, C., Light, A., Alaa, A., Thurtle, D., van der Schaar, M., & Gnanapragasam, V. J. (2021). Application of a novel machine learning framework for predicting non-metastatic prostate cancer-specific mortality in men using the Surveillance, Epidemiology, and End Results (SEER) database. The Lancet. Digital health, 3(3), e158--e165. https://doi.org/10.1016/S2589-7500(20)30314-9