Created by W.Langdon from gp-bibliography.bib Revision:1.4420

% Moore, Parker and Hahn (2000): one-page abstract on symbolic discriminant
% analysis (SDA), submitted to CAMDA00.
% The entry originally carried two URL fields; BibTeX keeps only the first and
% warns about the repeat, so the second link is stored under url2 instead.
% NOTE(review): the standard styles do not print booktitle or address for a
% Misc entry -- confirm whether the venue should also be given in howpublished.
@Misc{moore:2000:CAMDA,
  author =       "Jason H. Moore and Joel S. Parker and Lance W. Hahn",
  title =        "Symbolic Discriminant Analysis for Mining Gene Expression
                 Patterns",
  booktitle =    "Critical Assessment of Techniques for Microarray Data
                 Analysis (CAMDA00)",
  year =         "2000",
  address =      "Levine Science Research Building, Duke University, Durham,
                 N.C.",
  month =        "18--19 " # dec,
  note =         "submitted abstract",
  keywords =     "genetic algorithms, genetic programming, SDA",
  URL =          "http://www.camda.duke.edu/camda00/papers/days/papers/moore/paper.pdf",
  url2 =         "http://bioinformatics.duke.edu/CAMDA/CAMDA00/posters.asp#11",
  size =         "1 page",
  abstract =     "Linear discriminant analysis is a popular multivariate
                 statistical approach for classification of observations into
                 groups because the theory is well described and the method is
                 easy to implement and interpret. However, an important
                 limitation is that linear discriminant functions need to be
                 pre-specified. That is, specific variables need to be selected
                 and added linearly into the model. Only the coefficients are
                 estimated from the data. To address this limitation, we
                 developed symbolic discriminant analysis (SDA) for the
                 automatic selection of gene expression variables and
                 discriminant functions that can take any form. Our SDA
                 approach is inspired by the symbolic regression approach of
                 Koza (1992). We begin by defining the mathematical functions
                 (e.g. +, -, /, *, log, sqrt, etc.) and the list of gene
                 expression variables that could potentially be used as the
                 building blocks for discriminant functions. Symbolic
                 discriminant functions are evaluated by generating
                 discriminant scores for each observation to be classified. The
                 overlap in distributions of discriminant scores between groups
                 is an estimate of the classification error. Class membership
                 for new observations can be predicted from the discriminant
                 score that separates the distributions.
                 To identify optimal symbolic discriminant functions from the
                 near infinite model space, we employed parallel genetic
                 programming for machine learning on a 110 processor
                 Beowulf-style parallel supercomputer. We applied the SDA
                 approach to identifying subsets of gene expression variables
                 and symbolic discriminant functions that can correctly
                 classify and predict types of human acute leukemia. Using a
                 leave-one-out cross-validation strategy, we identified no
                 fewer than 15 different combinations of gene expression
                 variables and symbolic discriminant functions that correctly
                 classified 38/38 observations in the first dataset and
                 correctly predicted 31/34 observations in the independent
                 dataset. The most common gene identified across these models
                 was the human synaptonemal complex protein 1 (SCP1) gene that
                 is expressed in solid tumors and haematological malignancies.
                 We conclude that the SDA approach provides a powerful
                 alternative to traditional multivariate statistical methods
                 for identifying gene expression patterns. The advantages of
                 SDA include the ability to identify an important subset of
                 gene expression variables from among thousands of candidates
                 and the ability to identify the most appropriate mathematical
                 functions relating the gene expression variables to a clinical
                 endpoint. We anticipate this will be an important methodology
                 to add to the repertoire of approaches for mining gene
                 expression patterns.",
  notes =        "Program in Human Genetics, Department of Molecular Physiology
                 and Biophysics, Vanderbilt University Medical School,
                 Nashville, TN 37232-0700",
}

Genetic Programming entries for Jason H Moore, Joel S Parker, Lance W Hahn