Separating the wheat from the chaff: on feature selection and feature importance in regression random forests and symbolic regression

Created by W. Langdon from gp-bibliography.bib, Revision: 1.3872

@Comment{Workshop paper: 3rd symbolic regression and modeling workshop
  for GECCO 2011. The DOI field holds the bare DOI (no doi: or resolver
  prefix) so that a bibliography style can prepend the resolver itself.
  organisation, isbn13, publisher_address and notes are not standard
  BibTeX fields; standard styles are expected to ignore them.}

@InProceedings{Stijven:2011:GECCOcomp,
  author =       "Sean Stijven and Wouter Minnebo and
                 Katya Vladislavleva",
  title =        "Separating the wheat from the chaff: on feature
                 selection and feature importance in regression random
                 forests and symbolic regression",
  booktitle =    "3rd symbolic regression and modeling workshop for
                 GECCO 2011",
  year =         "2011",
  editor =       "Steven Gustafson and Ekaterina Vladislavleva",
  isbn13 =       "978-1-4503-0690-4",
  keywords =     "genetic algorithms, genetic programming",
  pages =        "623--630",
  month =        "12-16 " # jul,
  organisation = "SIGEVO",
  address =      "Dublin, Ireland",
  DOI =          "10.1145/2001858.2002059",
  publisher =    "ACM",
  publisher_address = "New York, NY, USA",
  abstract =     "Feature selection in high-dimensional data sets is an
                 open problem with no universal satisfactory method
                 available. In this paper we discuss the requirements
                 for such a method with respect to the various aspects
                 of feature importance and explore them using regression
                 random forests and symbolic regression. We study
                 'conventional' feature selection with both methods on
                 several test problems and a case study, compare the
                 results, and identify the conceptual differences in
                 generated feature importances.

                 We demonstrate that random forests might overlook
                 important variables (significantly related to the
                 response) for various reasons, while symbolic
                 regression identifies all important variables if models
                 of sufficient quality are found. We explain the results
                 by the fact that variable importance obtained by these
                 methods have different semantics.",
  notes =        "Also known as \cite{2002059} Distributed on CD-ROM at
                 GECCO-2011.

                 ACM Order Number 910112.",
}

Genetic Programming entries for Sean Stijven, Wouter Minnebo, Ekaterina (Katya) Vladislavleva

Citations