% Entry TD:100380. Standard bibliographic fields come first; the att_* fields
% are non-standard metadata (unknown fields are ignored by BibTeX styles) and
% are kept as exported, apart from encoding/hyphenation repairs in att_abstract.
@techreport{TD:100380,
  author               = {Phillips, Steven and Elith, Jane},
  title                = {Logistic methods for resource selection functions and presence-only species distribution models},
  institution          = {{AAAI conference}},
  month                = aug,
  year                 = 2011,
  att_abstract         = {{In order to better protect and conserve biodiversity, ecologists are making increasing use of machine learning and statistical modeling to understand how species respond to their environment and to predict how they will respond to future climate change, habitat loss and other threats. A fundamental modeling task is to estimate the conditional probability that a given species is present in (or uses) a site, conditional on environmental variables such as precipitation and temperature. For a limited number of species, survey data consisting of both presence and absence records are available, and can be used to fit a variety of conventional classification and regression models. For most species, however, the available data consists only of occurrence data---locations where the species has been observed. In two closely-related but separate bodies of ecological literature, a diversity of special-purpose models have been developed that contrast occurrence data with a random sample of available environmental conditions. The most widespread statistical approaches involve either fitting an exponential model of species' conditional probability of presence, or fitting a naive logistic model in which the random sample of available conditions is treated as absence data; both approaches have well-known problems, and in particular, do not necessarily produce valid probabilities. In this paper, after summarizing existing methods and their drawbacks, we overcome those drawbacks by introducing a new scaled binomial loss function that is straightforward to integrate into existing methods such as GLM, GAM, and boosted regression trees, in order to estimate an underlying logistic model of species presence/absence. Our approach is simpler than the Expectation-Maximization approach of Ward et al., which has not yet been used by ecologists despite giving valid probabilities.
Following Ward et al., our method requires an estimate of population prevalence, since prevalence is not identifiable from occurrence data alone. We demonstrate that recent approaches (such as the weighted distribution method of Lele and Keim) that try to avoid the identifiability issue by making parametric data assumptions do not typically produce valid probability estimates. Lastly, we introduce two additional new methods based on maximum entropy and a Chernoff bound that both also estimate the underlying logistic model given an estimate of prevalence.}},
  att_authors          = {sp8212},
  att_categories       = {},
  att_copyright        = {{Association for the Advancement of Artificial Intelligence}},
  att_copyright_notice = {{The definitive version was published in AAAI conference. {{, 2011-08-07}} }},
  att_donotupload      = {},
  att_private          = {false},
  att_projects         = {},
  att_tags             = {},
  att_techdoc          = {true},
  att_techdoc_key      = {TD:100380},
  att_url              = {http://web1.research.att.com:81/techdocs_downloads/TD:100380_DS1_2011-02-09T19:00:04.531Z.pdf},
}