Created by W.Langdon from gp-bibliography.bib Revision:1.2031
@TechReport{langdon:2008:CES-483,
  author =       "W. B. Langdon and A. P. Harrison",
  title =        "Evolving Regular Expressions for {GeneChip} Probe
                 Performance Prediction",
  institution =  "Computing and Electronic Systems",
  year =         "2008",
  number =       "CES-483",
  address =      "University of Essex, Wivenhoe Park, Colchester CO4
                 3SQ, UK",
  month =        "27 " # apr,
  keywords =     "genetic algorithms, genetic programming,
                 Bioinformatics, Affymetrix GeneChip, strongly typed
                 genetic programming, STGP, grammar, regular expression,
                 egrep, gawk",
  URL =          "http://www.essex.ac.uk/dces/research/publications/technicalreports/2008/CES-483.pdf",
  ISSN =         "1744-8050",
  abstract =     "Commercial GeneChips provide highly redundant but
                 noisy data. Rapid identification and subsequent
                 rejection of bad data effectively increases the quality
                 of the remaining data at little cost whilst serving as
                 a basis for better understanding the bio-physics of
                 short surface mounted DNA sequences.
                 Affymetrix High Density Oligonucleotide Arrays (HDONA)
                 simultaneously measure expression of thousands of genes
                 using millions of probes. Regular expressions can be
                 evolved from a Backus-Naur form (BNF) context-free
                 grammar using tree based strongly typed genetic
                 programming written in gawk. Fitness is given by egrep.
                 The quality of individual HG-U133A probes is indicated
                 by its correlation across 6685 human tissue samples
                 from NCBI's GEO database with other measurements for
                 the same gene. Low concordance indicates a poor probe.
                 The evolved data mined motif is better at predicting
                 poor DNA sequences than an existing human generated RE,
                 suggesting runs of Cytosine and Guanine and mixtures
                 should all be avoided. Section 4.6 gives more RE GP
                 gawk implementation details.
                 Code is available at
                 ftp://cs.ucl.ac.uk/genetic/gp-code/RE_gp.tar",
  size =         "18 pages",
}
Genetic Programming entries for William B Langdon and A P Harrison