% Tech-doc record TD:100380: Phillips and Elith (2011), "Logistic methods for
% resource selection functions and presence-only species distribution models".
% The att_* fields are non-standard export metadata; standard bibliography
% styles ignore unknown fields. The abstract uses "---" for the em dash so the
% entry stays ASCII-only.
@techreport{TD:100380,
	att_abstract={{In order to better protect and conserve biodiversity, ecologists
are making increasing use of machine learning and statistical
modeling to understand how species respond to their
environment and to predict how they will respond to future
climate change, habitat loss and other threats. A fundamental
modeling task is to estimate the conditional probability
that a given species is present in (or uses) a site, conditional
on environmental variables such as precipitation and temperature.
For a limited number of species, survey data consisting
of both presence and absence records are available, and
can be used to fit a variety of conventional classification and
regression models. For most species, however, the available
data consists only of occurrence data---locations where the
species has been observed. In two closely-related but separate
bodies of ecological literature, a diversity of special-purpose
models have been developed that contrast occurrence
data with a random sample of available environmental
conditions. The most widespread statistical approaches involve
either fitting an exponential model of species' conditional
probability of presence, or fitting a naive logistic
model in which the random sample of available conditions
is treated as absence data; both approaches have well-known
problems, and in particular, do not necessarily produce
valid probabilities. In this paper, after summarizing
existing methods and their drawbacks, we overcome those
drawbacks by introducing a new scaled binomial loss function
that is straightforward to integrate into existing methods
such as GLM, GAM, and boosted regression trees, in order
to estimate an underlying logistic model of species presence/absence.
Our approach is simpler than the Expectation-Maximization
approach of Ward et al., which has not yet
been used by ecologists despite giving valid probabilities.
Following Ward et al., our method requires an estimate of
population prevalence, since prevalence is not identifiable
from occurrence data alone. We demonstrate that recent approaches
(such as the weighted distribution method of Lele
and Keim) that try to avoid the identifiability issue by making
parametric data assumptions do not typically produce
valid probability estimates. Lastly, we introduce two additional
new methods based on maximum entropy and a Chernoff
bound that both also estimate the underlying logistic
model given an estimate of prevalence.}},
	att_authors={sp8212},
	att_categories={},
	att_copyright={{Association for the Advancement of Artificial Intelligence}},
	att_copyright_notice={{The definitive version was published in AAAI conference. {{, 2011-08-07}}
}},
	att_donotupload={},
	att_private={false},
	att_projects={},
	att_tags={},
	att_techdoc={true},
	att_techdoc_key={TD:100380},
	att_url={http://web1.research.att.com:81/techdocs_downloads/TD:100380_DS1_2011-02-09T19:00:04.531Z.pdf},
	author={Phillips, Steven and Elith, Jane},
	institution={{AAAI conference}},
	month=aug,
	title={Logistic methods for resource selection functions and presence-only species distribution models},
	year=2011,
}