% dsMTL preprint (bioRxiv, 2021). Citation key kept exactly as exported so existing \cite commands still resolve.
% Non-standard fields (collaborator, competing-interests, pdf-url, elocation-id) are ignored by bibliography styles
% and are kept only as metadata. NOTE(review): `collaborator` holds the name list the export had placed in `editor`;
% it is presumably the COMMITMENT consortium membership -- confirm against the preprint.
@article{Cao2021.08.26.457778,
  author              = {Cao, Han
                         and Zhang, Youcheng
                         and Baumbach, Jan
                         and Burton, Paul R.
                         and Dwyer, Dominic
                         and Koutsouleris, Nikolaos
                         and Matschinske, Julian
                         and Marcon, Yannick
                         and Rajan, Sivanesan
                         and Rieg, Thilo
                         and Ryser-Welch, Patricia
                         and Sp{\"a}th, Julian
                         and {{The COMMITMENT consortium}}
                         and Herrmann, Carl
                         and Schwarz, Emanuel},
  title               = {{dsMTL} - a computational framework for privacy-preserving, distributed multi-task machine learning},
  journal             = {bioRxiv},
  publisher           = {Cold Spring Harbor Laboratory},
  year                = {2021},
  doi                 = {10.1101/2021.08.26.457778},
  eprint              = {2021.08.26.457778},
  eprinttype          = {bioRxiv},
  elocation-id        = {2021.08.26.457778},
  url                 = {https://www.biorxiv.org/content/early/2021/08/28/2021.08.26.457778},
  pdf-url             = {https://www.biorxiv.org/content/early/2021/08/28/2021.08.26.457778.full.pdf},
  collaborator        = {Schwarz, Emanuel
                         and Aln{\ae}s, Dag
                         and Andreassen, Ole A.
                         and Cao, Han
                         and Chen, Junfang
                         and Degenhardt, Franziska
                         and Doncevic, Daria
                         and Dwyer, Dominic
                         and Eils, Roland
                         and Erdmann, Jeanette
                         and Herrmann, Carl
                         and Hofmann-Apitius, Martin
                         and Koutsouleris, Nikolaos
                         and Kodamullil, Alpha T.
                         and Khuntia, Adyasha
                         and Mucha, S{\"o}ren
                         and N{\"o}then, Markus M.
                         and Paul, Riya
                         and Pedersen, Mads L.
                         and Schunkert, Heribert
                         and Tost, Heike
                         and Westlye, Lars T.
                         and Zhang, Youcheng
                         and Meyer-Lindenberg, Andreas},
  abstract            = {Multitask learning allows the simultaneous learning of multiple {\textquoteleft}communicating{\textquoteright} algorithms. It is increasingly adopted for biomedical applications, such as the modeling of disease progression. As data protection regulations limit data sharing for such analyses, an implementation of multitask learning on geographically distributed data sources would be highly desirable. Here, we describe the development of dsMTL, a computational framework for privacy-preserving, distributed multi-task machine learning that includes three supervised and one unsupervised algorithms. dsMTL is implemented as a library for the R programming language and builds on the DataSHIELD platform that supports the federated analysis of sensitive individual-level data. We provide a comparative evaluation of dsMTL for the identification of biological signatures in distributed datasets using two case studies, and evaluate the computational performance of the supervised and unsupervised algorithms. dsMTL provides an easy-to-use framework for privacy-preserving, federated analysis of geographically distributed datasets, and has several application areas, including comorbidity modeling and translational research focused on the simultaneous prediction of different outcomes across datasets. dsMTL is available at https://github.com/transbioZI/dsMTLBase (server-side package) and https://github.com/transbioZI/dsMTLClient (client-side package).},
  competing-interests = {AML has received consultant fees from: Boehringer Ingelheim, Elsevier, Brainsway, Lundbeck Int. Neuroscience Foundation, Lundbeck A/S, The Wolfson Foundation, Bloomfield Holding Ltd, Shanghai Research Center for Brain Science, Thieme Verlag, Sage Therapeutics, v Behring Roentgen Stiftung, Fondation FondaMental, Janssen-Cilag GmbH, MedinCell, Brain Mind Institute, Agence Nationale de la Recherche, CISSN (Catania Internat. Summer School of Neuroscience), Daimler und Benz Stiftung, American Association for the Advancement of Science, Servier International. Additionally he has received speaker fees from: Italian Society of Biological Psychiatry, Merz-Stiftung, Forum Werkstatt Karlsruhe, Lundbeck SAS France, BAG Psychiatrie Oberbayern, Klinik fuer Psychiatrie und Psychotherapie Ingolstadt, med Update GmbH, Society of Biological Psychiatry, Siemens Healthineers, Biotest AG. All other authors have no potential conflicts of interest.},
}