% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
% Journal article: Young et al., Scientific Reports 14(1), 2024.
% The single value in "pages" appears to be an article number, not a page range.
% reportid, cin, ddc, cid, pnm, pid, typ, pubmed and pmc are not standard
% BibTeX fields; they look like repository-export metadata and are ignored by
% standard bibliography styles.
@article{Young:269770,
  author    = {Young, Cameron C. and Eason, Katherine and Manzano Garcia,
               Raquel and Moulange, Richard and Mukherjee, Sach and Chin,
               Suet-Feung and Caldas, Carlos and Rueda, Oscar M.},
  title     = {Development and validation of a reliable {DNA}
               copy-number-based machine learning algorithm ({CopyClust})
               for breast cancer integrative cluster classification},
  journal   = {Scientific Reports},
  volume    = {14},
  number    = {1},
  issn      = {2045-2322},
  address   = {London},
  publisher = {Macmillan Publishers Limited, part of Springer Nature},
  reportid  = {DZNE-2024-00612},
  pages     = {11861},
  year      = {2024},
  abstract  = {The Integrative Cluster subtypes (IntClusts) provide a
               framework for the classification of breast cancer tumors
               into 10 distinct groups based on copy number and gene
               expression, each with unique biological drivers of disease
               and clinical prognoses. Gene expression data is often
               lacking, and accurate classification of samples into
               IntClusts with copy number data alone is essential. Current
               classification methods achieve low accuracy when gene
               expression data are absent, warranting the development of
               new approaches to IntClust classification. Copy number data
               from 1980 breast cancer samples from METABRIC was used to
               train multiclass XGBoost machine learning algorithms
               (CopyClust). A piecewise constant fit was applied to the
               average copy number profile of each IntClust and unique
               breakpoints across the 10 profiles were identified and
               converted into $\sim$500 genomic regions used as features for
               CopyClust. These models consisted of two approaches: a
               10-class model with the final IntClust label predicted by a
               single multiclass model and a 6-class model with binary
               reclassification in which four pairs of IntClusts were
               combined for initial multiclass classification. Performance
               was validated on the TCGA dataset, with copy number data
               generated from both SNP arrays and WES platforms. CopyClust
               achieved $81\%$ and $79\%$ overall accuracy with the TCGA
               SNP and WES datasets, respectively, a nine-percentage point
               or greater improvement in overall IntClust subtype
               classification accuracy. CopyClust achieves a significant
               improvement over current methods in classification accuracy
               of IntClust subtypes for samples without available gene
               expression data and is an easily implementable algorithm for
               IntClust classification of breast cancer samples with copy
               number data.},
  keywords  = {Humans / Breast Neoplasms: genetics / Breast Neoplasms:
               classification / Machine Learning / Female / DNA Copy Number
               Variations: genetics / Algorithms / Cluster Analysis / Gene
               Expression Profiling: methods},
  cin       = {AG Mukherjee},
  ddc       = {600},
  cid       = {I:(DE-2719)1013030},
  pnm       = {354 - Disease Prevention and Healthy Aging (POF4-354)},
  pid       = {G:(DE-HGF)POF4-354},
  typ       = {PUB:(DE-HGF)16},
  pubmed    = {pmid:38789621},
  pmc       = {pmc:PMC11126405},
  doi       = {10.1038/s41598-024-62724-6},
  url       = {https://pub.dzne.de/record/269770},
}