\documentclass[entropy,article,accept,pdftex,moreauthors]{Definitions/mdpi}
\firstpage{1}
\makeatletter
\setcounter{page}{\@firstpage}
\makeatother
\pubvolume{1}
\issuenum{1}
\articlenumber{0}
\pubyear{2024}
\copyrightyear{2024}
\externaleditor{\textls[-25]{Academic Editor: Firstname Lastname}}
\datereceived{19 April 2024}
\daterevised{25 May 2024} % Comment out if no revised date
\dateaccepted{31 May 2024}
\datepublished{ }
\hreflink{https://doi.org/} % If needed use \linebreak
\Title{Multimodel %MDPI: Dear Authors,
%(1) Please use the final version file to do proofreading.
%(2) Please proofread the article format according to the comments left and pay attention to the highlighted parts.
%(3) Please do not delete the comments during your proofreading, since we need to check point by point after receiving your proofed version.
%(4) Please revise with track change Function and answer all questions that we pro-posed. Such as: “It should be italic”; “I confirm”; “I have checked and revised all”.
%% BMB: what do you mean by "track changes function"? This exists in Word and OpenOffice, but I don't understand how it applies to a LaTeX document.
%(5) To avoid misunderstanding your response, please directly correct on this version.
%(6) English editing and layout have been complete, please read carefully and check if the intended meaning has been retained and make adjustments if needed.
Approaches Are Not the Best Way to Understand Multifactorial Systems}
\TitleCitation{Multimodel Approaches Are Not the Best Way to Understand Multifactorial Systems}
\newcommand{\orcidauthorA}{0000-0002-2127-0443} % Add \orcidA{} behind the author's name
\Author{Benjamin M. Bolker %MDPI: 1 Please carefully check the accuracy of name and affiliation. Authorship can not be changed during this period (including adding new authors/corresponding authors/affiliations or deleting present authors/corresponding authors/affiliations or exchanging author orders). 2. We delete the note sign, please confirm, if we must keep the sign, please add the explanation for it.
%% BMB: name and affiliation are correct. Notes were only present because they were generated by the template; deleting them is fine.
\orcidA{}}
\AuthorNames{Benjamin M. Bolker}
\AuthorCitation{Bolker, B.M. %MDPI: Please carefully check the accuracy of name. %% BMB: correct.
}
\address[1]{%
Departments of Mathematics \& Statistics and Biology, McMaster University, Hamilton, ON L8S4K1, Canada; [email protected]
}
\abstract{Information-theoretic (IT) and multi-model averaging (MMA) statistical
approaches are widely used but suboptimal tools for pursuing a
multifactorial approach (also known as the method of multiple working
hypotheses) in ecology. (1) Conceptually, IT encourages ecologists to
perform tests on sets of artificially simplified models. (2) MMA
improves on IT model selection by implementing a simple form of shrinkage %MDPI:We remioved the italic, please confirm and check all in the whole text. %% BMB: I prefer to use highlighting to introduce new terms but I can live without it if it conflicts with the house style.
estimation (a way to make accurate predictions from a
model with many parameters relative to the amount of data, by
``shrinking'' parameter estimates toward zero). However, other shrinkage
estimators such as penalized regression or Bayesian hierarchical models
with regularizing priors are more computationally efficient and better
supported theoretically.\linebreak (3) In general, the procedures for extracting
confidence intervals from MMA are overconfident, providing overly narrow
intervals. If researchers want to use limited data sets to accurately
estimate the strength of multiple competing ecological processes along
with reliable confidence intervals, the current best approach is to use
full (maximal) statistical models (possibly with Bayesian priors) after
making principled, {a priori} decisions about model complexity.
}
% Keywords
\keyword{null-hypothesis significance testing; multi-model averaging; shrinkage estimators; Akaike information criterion; statistical inference}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{document}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Modern scientific research often aims to quantify the effects of
multiple simultaneously operating processes in natural or human systems.
Some examples from my own work in ecology and evolution consider the
effects of herbivory and fertilization on standing biomass \cite{ref-Gruner+2008}; %MDPI: Please cite all references with reference numbers and place the numbers in square brackets (“[ ]”), e.g., [1], [1–3], or [1,3]. And refrence should be cited in the numberial order. Please refer to the following website for more information: http://www.mdpi.com/authors/references. e.g., change “((Gruner et al. 2008))” to \cite{ref-Gruner+2008} and reorder the ref “{ref-Gruner+2008}” as ref 1 in references section. Please revise all of them and ensure all the citations match up with the References in the References section.
%% BMB: this is a terrible nuisance. I followed all of the instructions to authors as carefully as I could, including the provided LaTeX template. It would have been extremely useful to know this earlier so I could have used bibtex to generate this properly.
the effects of bark, wood density, and fire on tree mortality
\cite{ref-brando_fire-induced_2012}; or the effects of taxonomic and genomic position
on evolutionary rates \cite{ref-ghenu_multicopy_2016}. This multifactorial
approach \cite{ref-mcgill_why_2016} complements, rather than replacing, the
traditional hypothesis-testing or strong-inferential framework \cite{ref-platt_strong_1964, ref-fox_why_2016, ref-betini_why_2017}. (While there is much interesting debate over the best methods for gathering evidence to distinguish among two or more particular, intrinsically discrete hypotheses \cite{ref-taper_evidential_2015},
that is not the focus of this paper.) %MDPI: Since footnote is not allowed in the journal. We moved note into main text. please confirm and check the whole text carefully. %% BMB: confirmed
Such attempts to quantify the
magnitude or importance of different processes also differ from
predictive modeling, which dominates the fields of machine learning and
artificial intelligence \cite{ref-hastieElements2009}.
The prediction and quantification of process strength are closely related---if we can accurately predict outcomes over a range of conditions,
then we can also predict the effects of changes in those conditions, and
hence infer the strengths of processes, if the changes we are trying
to predict are adequately reflected in our training data. However,
predictive modelers are usually primarily concerned with predictions
within the natural range of conditions, which may not provide us enough
information to reliably make inferences about processes. This paper
focuses on statistical modeling for estimation and inference, rather
than prediction.

A standard approach to analyzing multifactorial systems, particularly
common in ecology, is as follows: (1) Construct a full model that
encompasses as many of the processes (and their interactions) as is
feasible. (2) Fit the full model and make sure that it describes the
data reasonably well (e.g.,~by examining model diagnostics and by ensuring that the level of unexplained variation is not unacceptably large).
(3) Construct possible submodels of the full model by setting subsets of the
parameters to zero. (4) Compute information-theoretic measures of
quality, such as the Akaike or Bayesian/Schwarz information criteria,
for every submodel. (5) Use multi-model averaging (MMA) to estimate
model-averaged parameters and confidence intervals (CIs), and possibly draw
conclusions about the importance of different processes by summing the
information-theoretic weights \cite{ref-burnham_model_2002}. I argue that
this approach, even if used sensibly as advised by proponents of the
approach (e.g.~with reasonable numbers of candidate submodels), is a
poor way to approach estimation and inference for multifactorial
problems.

For example, suppose we want to understand the effects of
ecosystem-level net primary productivity and fire intensity on species
diversity (a simplified version of the analysis done in \cite{ref-moritzRole2023a}). The model-comparison or model-averaging
approach would construct five models: a null model with no effects of
either productivity or fire, two single-factor models, an additive
model, and a full model allowing for interactions between productivity
and fire. We would then fit all of these models and model-average their
parameters, and derive model-averaged confidence intervals.
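
Purely to make this procedure concrete, here is a minimal sketch in R with
simulated data (all variable and object names are hypothetical, and the data
are not those of the cited study); Akaike weights are computed by hand, and
coefficients are averaged with absent terms counted as zero:
\begin{verbatim}
## illustrative sketch: simulated data, hypothetical variable names
set.seed(1)
dat <- data.frame(npp = rnorm(100), fire = rnorm(100))
dat$diversity <- 2 + 0.5 * dat$npp - 0.3 * dat$fire + rnorm(100)

## the five candidate models: null, two single-factor, additive, interaction
forms <- list(diversity ~ 1, diversity ~ npp, diversity ~ fire,
              diversity ~ npp + fire, diversity ~ npp * fire)
fits <- lapply(forms, lm, data = dat)

## AIC values and Akaike weights
aic <- sapply(fits, AIC)
w <- exp(-(aic - min(aic)) / 2)
w <- w / sum(w)

## model-averaged coefficients ("full" average: absent terms count as zero)
terms_all <- names(coef(fits[[5]]))
cf <- sapply(fits, function(f) coef(f)[terms_all])
rownames(cf) <- terms_all
cf[is.na(cf)] <- 0
drop(cf %*% w)
\end{verbatim}
(Packages such as \texttt{MuMIn} automate these steps; the sketch only spells
out what is being computed.)
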
The goal of a multifactorial analysis is to tease apart the
contributions of many processes, all of which we believe are
affecting our study system to some degree. If our scientific questions
are (something like) ``How important is this factor, in an absolute
sense or relative to other factors?'' (or equivalently, ``How much does
a change in this factor change the system in absolute or relative
terms?''), rather than ``Which of these factors are having {any
effect at all} on my system?'', why are we working so hard to fit many
models of which only one (the full model) incorporates all of the
factors? If we do not have particular, {a priori} discrete
hypotheses about our system (such as ``process \(A\) influences the
outcome but process \(B\) has no effect at all''), why does so much of
our data-analytic effort go into various ways to test between, or
combine and reconcile, multiple discrete models? In software
development, this is called an ``XY problem'' (\url{http://www.perlmonks.org/?node=XY+Problem}, accessed on 2 June 2024): %MDPI: 1 Please add the access date (format: Date Month Year), e.g., accessed on 1 January 2020. 2 Please check if the link is valid, we cannot accessed on it. %% BMB: confirmed that I was able to access it today.
rather than thinking about the best way to solve our real problem \(X\)
(understanding multifactorial systems), we have become bogged down in
the details of how to make a particular tool, \(Y\) (multimodel
approaches), provide the answers we need. %Please check intended meaning is retained %% BMB: this is OK.

Most critiques of MMA address
technical concerns such as the influence of unobserved heterogeneity
\cite{ref-brewer_relative_2016}
or criticize the misuse of
information-theoretic methods by researchers \cite{ref-mundryIssues2011,ref-cade_model_2015},
but do not ask why we are comparing discrete models in the first place.

In contrast with averaging across discrete hypotheses or treating a
choice of discrete hypotheses as an end goal, fitting and comparing
multiple models as a step in a null-hypothesis significance testing
(NHST) procedure is defensible. In the biodiversity analysis described
above, we might fit the full model and then assess the significance of
individual terms by comparing the fit of the full model to models with
those terms dropped (taking particular care with the interpretation of
dropping a lower-level effect in models with interactions, e.g.,~see
\cite{ref-bernhardtInterpretation1979}). While much maligned, NHSTs are a useful part
of data analysis---not to decide whether we really think a null
hypothesis is false (they almost always are), but to see if we can
distinguish signal from noise. Another interpretation is that NHSTs can
test whether we can reliably determine the direction of effects---that is, not whether the effect of a predictor on some process is
zero, but whether we can tell unequivocally that it has a particular
sign---positive or negative \cite{ref-jones_sensible_2000,ref-dushoff_i_2019}.
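
Continuing the simulated-data sketch above (again purely for illustration),
such one-step-reduced comparisons can be carried out directly from the full
fit:
\begin{verbatim}
## NHST via one-step-reduced models (simulated data from the sketch above)
fit_full <- lm(diversity ~ npp * fire, data = dat)
drop1(fit_full, test = "F")   # only drops terms whose removal respects marginality
anova(update(fit_full, . ~ . - npp:fire), fit_full)   # explicit full-vs-reduced test
\end{verbatim}
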
However, researchers using multimodel approaches are not fitting
one-step-reduced models to test hypotheses; rather, they are fitting a
wide range of submodels, typically in the hope that model choice or
multimodel averaging will help them deal with insufficient data in a
multifactorial world. If we had enough information (even Big Data
does not always provide the information we need \cite{ref-mengStatistical2018}), we could
fit only the full model, drawing our conclusions from the estimates and
CIs with all of the factors considered simultaneously. But we nearly
always have too many predictors, and not enough data; we do not want to
overfit (which will inflate our CIs and \(p\)-values to the point where
we cannot tell anything for sure), but at the same time we are afraid of
neglecting potentially important effects.

Stepwise regression, the original strategy for separating signals from
noise, is now widely deprecated because it interferes with correct
statistical inference \cite{ref-harrell_regression_2001,ref-romanoStepwise2005,ref-whittingham_why_2006,ref-mundryStepwise2009}. Information-theoretic tools mitigate
the instability of stepwise approaches, allow for the simultaneous comparison of
many non-nested models, and avoid the stigma of NHST. A further step
forward, multi-model averaging \cite{ref-burnham_model_2002}, accounts for
model uncertainty and avoids focusing on a single best model. Some forms
of model averaging provide {shrinkage estimators}; averaging the
strength of effects between models where they are included and models
where they are absent adjusts the estimated effects toward zero \cite{ref-cade_model_2015}. More recently, model averaging has experienced a backlash, as
studies point out that multimodel averaging may run into trouble when
variables are collinear and/or have differential levels of measurement
error \cite{ref-freckleton_dealing_2011}, when we are careless about the meaning of
main effects in the presence of interactions, when we average model
parameters rather than model predictions \cite{ref-cade_model_2015}, or when we use
summed model weights to assess the relative importance of predictors
(\cite{ref-galipaud_ecologists_2014,ref-cade_model_2015}; but cf. \cite{ref-zhang_model_2015}).

Freckleton \cite{ref-freckleton_dealing_2011} makes the point that model averaging will tend to
shrink the estimates of multicollinear predictors toward each other, so
that estimates of weak effects will be biased upward and estimates of
strong effects will be biased downward. This is an unsurprising (in hindsight)
consequence of shrinkage estimation. With other analytical methods such
as lasso regression, or selection of a single best model by AIC, the
weaker of two correlated predictors, or more precisely the one that
appears weaker based on the available data, could be eliminated
entirely, so that all of its effects are attributed to the stronger
predictor. Researchers often make a case for dropping correlated terms
in this way because collinearity of predictors inflates parameter
uncertainty and complicates interpretation. However, others have
repeatedly pointed out that collinearity is a problem of intrinsic
uncertainty---we are simply missing the data that would tell us which
combination of collinear factors really drives the system. The
confidence intervals of parameters from a full model estimated by
regression or maximum likelihood will correctly identify this
uncertainty; modeling procedures that automatically drop collinear
predictors (by model selection or sparsity-inducing penalization) not
only fail to resolve the issue, but can lead to inaccurate predictions
based on new data \cite{ref-grahamConfronting2003,ref-morrisseyMultiple2018,ref-fengCollinearity2019a,ref-vanhoveCollinearity2021}.

A full model might (correctly) tell us that we cannot
confidently assess whether either productivity or fire decreases or
increases species diversity, because their estimated effects are strongly
correlated. However, by comparing the fit of the full model to one that
dropped both productivity and fire, we could conclude that their joint
effect is highly significant.
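
A tiny self-contained simulation (hypothetical, and not the analysis of the
cited study) makes the contrast explicit:
\begin{verbatim}
## two strongly collinear predictors: individual estimates uncertain,
## joint effect unambiguous
set.seed(2)
x1 <- rnorm(100)
x2 <- x1 + rnorm(100, sd = 0.05)   # nearly collinear with x1
y  <- x1 + x2 + rnorm(100)
fit <- lm(y ~ x1 + x2)
confint(fit)                       # wide CIs for x1 and x2: either sign plausible
anova(lm(y ~ 1), fit)              # joint effect of x1 and x2 is highly significant
\end{verbatim}
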
In ecology, information criteria were introduced by applied ecologists
who were primarily interested in making the best possible predictions to
inform conservation and management; they were less concerned with
inference or quantifying the strength of underlying processes \cite{ref-BurnAnde98,ref-burnham_model_2002,ref-johnsonModel2004a}. Rather than using
information criteria as tools to identify the best predictive model, or
to obtain the best overall (model-averaged) predictions, most current
users of information-theoretic methods use them either to quantify
variable importance, or, by multimodel averaging, to have their cake and
eat it too---to avoid either over- or underfitting while quantifying the
effects in multifactorial systems. There are two problems with this
approach---one conceptual and one practical.

The conceptual problem with model averaging reflects the original sin of
unnecessarily discretizing a continuous model space. When we fit many
different models as part of our analytical process (based on selection
or averaging), the models are only a means to an end; despite the claims
of some information-theoretic modelers, we are not really using the
submodels in support of the method of multiple working hypotheses as
described by Chamberlin \cite{ref-chamberlinMethod1890}. For example, Chamberlin argued that in
teaching about the origin of the Great Lakes, we should urge students
``to conceive of three or more great agencies {[}pre-glacial erosion,
glacial erosion, crust deformation{]} working successively or
simultaneously, and to estimate how much was accomplished by each of
these agencies''. Chamberlin was {not} suggesting that we test
which individual mechanism or combination of mechanisms fits the data
best (in whatever sense), but instead that we acknowledge that the world
is multifactorial. In a similar vein, Gelman and Shalizi \cite{ref-gelmanPhilosophy2013} advocate
``continuous model expansion'', creating models that include all
components of interest (with appropriate Bayesian priors to constrain
the overall complexity of the model) rather than selecting or averaging
across discrete sets of models that incorporate subsets of the
processes.

Here, I am not concerned with whether `truth' is included in our model set (it
is not), or with how this matters to our inference \cite{ref-bernardoBayesian1994,ref-barker_truth_2015}.
I am claiming the opposite, that our full model---while certainly not the true model---is usually the closest
thing we have to a true model. This claim seems to contradict the
information-theoretic result that the best approximating model (i.e.,
the minimum-AIC model) is expected to be closest to the true
(generating) model in a predictive sense (i.e., it has the smallest
expected Kullback--Leibler distance) \cite{ref-poncianoMultimodel2018}. However,
the fact that excluding some processes allows the fitted model to better
match the observations does not mean that we should believe these
processes are not affecting our system---just that, with the available
data, dropping terms will provide better predictions than keeping the
full model. %Please check intended meaning is retained %% BMB: OK
If we are primarily interested in prediction, or in
comparing qualitatively different, possibly non-nested hypotheses
\cite{ref-luttbeg_comparing_2004}, information-theoretic methods
match our goals well.

The technical problem with model averaging is its computational
inefficiency. Individual models can take minutes or hours to fit, and we
may have to fit dozens or scores of sub-models in the multi-model
averaging process. There are efficient tools available for fitting
``right-sized'' models that avoid many of the technical problems of
model averaging. Penalized methods such as ridge and lasso regression
\cite{ref-dahlgren_alternative_2010} are well known in some scientific fields; in a Bayesian
setting, informative priors centered at zero have the same effect of
regularizing---pushing weak effects toward zero and controlling
model complexity (more or less synonymous with the {shrinkage} of
estimates described above) \cite{ref-lemoineMoving2019a}. Developed for optimal
(predictive) fitting in models with many parameters, penalized models
have well-understood statistical properties; they avoid the pitfalls of
model-averaging correlated or nonlinear parameters; and, by avoiding the
need to fit many sub-models in the model-averaging processes, they are
much faster. (However, they may require a computationally
expensive cross-validation step in order to choose the degree of
penalization.) %% BMB: OK
Furthermore, penalized approaches underlie modern
nonparametric methods such as additive models and Gaussian processes
that allow models to expand indefinitely to match the available data
\cite{ref-rasmussenGaussian2005,ref-woodGeneralized2017}.
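
As a minimal sketch (assuming the \texttt{glmnet} package and simulated data,
with hypothetical names), a penalized fit looks like this:
\begin{verbatim}
## penalized regression sketch: lasso (alpha = 1); alpha = 0 gives ridge
library(glmnet)
set.seed(3)
X <- matrix(rnorm(100 * 10), 100, 10)   # all candidate predictors, fit at once
y <- 0.5 * X[, 1] - 0.3 * X[, 2] + rnorm(100)
cvfit <- cv.glmnet(X, y, alpha = 1)     # cross-validation chooses the penalty
coef(cvfit, s = "lambda.1se")           # shrunken coefficient estimates
\end{verbatim}
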
Penalized models have their own challenges. A big advantage of
information-theoretic methods is that, like wrapper methods for feature
selection in machine learning \cite{ref-chandrashekarSurvey2014}, they can be
applied as long as we can fit the component models and extract the
log-likelihood and number of parameters---we never need to build new
software. Although powerful computational tools exist for fitting
penalized versions of linear and generalized linear models (e.g.,~the
\texttt{{glmnet}} package for R) and mixed models (\texttt{glmmLasso}),
quantile regression \cite{ref-koenkerQuantile2017}, software for some more exotic models
(e.g.,~zero-inflated models, models for censored data) may not be readily
available. Fitting these models requires the user to choose the strength
of penalization. This process is conveniently automated in tools like
\texttt{glmnet}, but correctly assessing the out-of-sample accuracy (and
hence the correct level of penalization) is tricky for data that are
correlated in space or time \cite{ref-wenger_assessing_2012,ref-robertsCrossvalidation2016}.
Penalization (or regularization) can also be achieved by imposing
Bayesian priors on subsets of parameters \cite{ref-chungNondegenerate2013}, but this
converts the choice of strength of penalization to a similarly
challenging choice of appropriate priors.
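
For example (a hedged sketch continuing the \texttt{glmnet} code above, with
hypothetical spatial blocks), fold membership can be assigned by block rather
than at random, and a Bayesian analogue replaces the penalty with a
regularizing prior:
\begin{verbatim}
## blocked cross-validation: keep spatially or temporally correlated
## observations together in the same fold
block <- rep(1:10, each = 10)                 # hypothetical spatial blocks
cv_blocked <- cv.glmnet(X, y, foldid = block, alpha = 1)
## Bayesian analogue (rstanarm assumed; not run):
## stan_glm(diversity ~ npp * fire, data = dat, prior = normal(0, 0.5))
\end{verbatim}
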
Finally, frequentist inference (computing \(p\)-values and CIs) for
parameters in penalized models---one of the basic outputs we want from
a statistical analysis of a multifactorial system---is a current
research problem; statisticians have proposed a variety of methods
\cite{ref-potscherConfidence2010a,ref-javanmard_confidence_2014,ref-lockhart_significance_2014,ref-taylorPostselection2018},
but they typically make extremely
strong asymptotic assumptions and are far from being standard options in
software. Scientists should encourage their friends in statistics and
computer science to build tools that make penalized approaches easier to
use.

Statisticians derived confidence intervals for ridge regression long ago
\cite{ref-obenchain_classical_1977}---surprisingly, they are identical to the confidence
intervals one would have obtained from the full model without
penalization. Wang and Zhou \cite{ref-wangInterval2013b} similarly proved that model-averaging
CIs derived as suggested by Hjort and Claeskens \cite{ref-hjortFrequentist2003} are
asymptotically (i.e.,~for arbitrarily large data sets) equivalent to the
CIs from the full model. Analytical and simulation studies
\cite{ref-turek2012model,ref-fletcher2012model,ref-turek2013frequentist,ref-turek2015comparison,ref-kabaila_model-averaged_2016,ref-dormann_model_2018} have shown
that a variety of alternative methods for constructing CIs are
overoptimistic, i.e.,~that they generate too-narrow confidence intervals
with coverage lower than the nominal level. Simulations from several of
the studies above show that MMA confidence intervals constructed
according to the best known procedures typically include the true
parameter values only about 80\% or 90\% of the time. In particular,
Kabaila \emph{et al.} \cite{ref-kabaila_model-averaged_2016} argue that constructing CIs that
take advantage of shrinkage while still achieving correct coverage will be
very difficult with model-averaged confidence intervals.
(The only examples I have been able to find of MMA confidence intervals
with close to nominal coverage are from Chapter 5 of \cite{ref-burnham_model_2002}.) In short, it seems difficult to find model-averaged
confidence intervals that compete successfully with the standard
confidence interval based on the full model.
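
As a rough check (a small simulation of my own, not one taken from the
studies cited above), the full-model CI attains essentially nominal coverage:
\begin{verbatim}
## coverage of the full-model 95% CI for one coefficient (true value 0.3)
set.seed(4)
covered <- replicate(1000, {
  x1 <- rnorm(50); x2 <- rnorm(50)
  y  <- 0.3 * x1 + 0.1 * x2 + rnorm(50)
  ci <- confint(lm(y ~ x1 + x2))["x1", ]
  ci[1] < 0.3 && 0.3 < ci[2]
})
mean(covered)   # approximately 0.95
\end{verbatim}
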
Free lunches do not exist in statistics, any more than anywhere else. We
can use penalized approaches to improve prediction accuracy without
having to sacrifice any input variables (by trading bias for variance),
but the only known way to gain statistical power for testing hypotheses,
or to narrow our uncertainty about our predictions, is to limit the
scope of our models {a priori} \cite{ref-harrell_regression_2001}, to add information
from pre-specified Bayesian priors (or equivalent regularization
procedures), or to collect more data. Burnham and Anderson \cite{ref-burnhamMultimodel2004b}
defined a ``savvy'' prior that reproduces the results of AIC-based
multimodel averaging in a Bayesian framework, but it is a weak
conceptual foundation for understanding multifactorial systems. Because
it is a prior on discrete models, rather than on the magnitude of
continuous parameters that describe the strength of different processes,
it induces a spike-and-slab type prior on parameters that assigns a
positive probability to the unrealistic case of a parameter being
exactly zero; furthermore, the prior will depend on the particular set
of models being considered.
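
For illustration (a generic form, not Burnham and Anderson's exact
construction), a spike-and-slab prior on a coefficient \(\beta_j\) can be
written as
\[
\pi(\beta_j) = w \, \delta_0(\beta_j) + (1 - w) \, g(\beta_j),
\]
where \(\delta_0\) is a point mass at zero (the ``spike'', with prior
probability \(w\) that the effect is exactly absent) and \(g\) is a continuous
``slab'' density for nonzero effects.
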
Multimodel averaging is probably most popular in ecology (in May 2024,
Google Scholar returned \(\approx\) 65,000 hits for ``multimodel
averaging'' alone and 31,000 for ``multimodel averaging ecology'').
However, multifactorial systems---and the problems of approaching
inference through comparing and combining discrete models that consider
artificially limited subsets of the processes we know are operating---
occur throughout the sciences of complexity, those involving biological
and human processes. In psychology, economics, sociology, epidemiology,
ecology, and evolution, every process that we can imagine has
{some} influence on the outcomes that we observe. Pretending that
some of these processes are completely absent can be a useful means to
an inferential or computational end, but it is rarely what we actually
believe about the system (although see \cite{ref-mundryIssues2011} for a
counterargument). We should not let this useful pretense become our
primary statistical focus.

If we have sensible scientific questions and good experimental designs,
muddling through with existing techniques will often provide reasonable
results \cite{ref-murtaugh_performance_2009}. But researchers should at least be aware that
the roundabout statistical methods they currently use to understand
multifactorial systems were designed for prediction, or the comparison
of discrete hypotheses, rather than for quantifying the relative
strength of simultaneously operating processes. When prediction is the
primary goal, penalized methods can work better (faster and with
better-understood statistical properties) than multimodel averaging.
When estimating the magnitude of effects or judging variable importance,
penalized or Bayesian methods may be appropriate---or we may have to
go back to the difficult choice of focusing on a restricted number of
variables for which we have enough data, and then fitting and interpreting the
full model.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\vspace{6pt}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% optional
%\supplementary{The following supporting information can be downloaded at: \linksupplementary{s1}, Figure S1: title; Table S1: title; Video S1: title.}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\funding{This research was funded by NSERC Discovery grants 2016-05488 and 2023-05400.}
\institutionalreview{Not applicable.}%MDPI: In this section, you should add the Institutional Review Board Statement and approval number, if relevant to your study. You might choose to exclude this statement if the study did not require ethical approval. Please note that the Editorial Office might ask you for further information. Please add “The study was conducted in accordance with the Declaration of Helsinki, and approved by the Institutional Review Board (or Ethics Committee) of NAME OF INSTITUTE (protocol code XXX and date of approval).” for studies involving humans. OR “The animal study protocol was approved by the Institutional Review Board (or Ethics Committee) of NAME OF INSTITUTE (protocol code XXX and date of approval).” for studies involving animals. OR “Ethical review and approval were waived for this study due to REASON (please provide a detailed justification).” OR “Not applicable” for studies not involving humans or animals.}
%% BMB: 'not applicable'
\dataavailability{There is no data in the article.}%MDPI: This section is necessary, please do not delete it. Also, please do not add “Not applicable”. You can add: option 1: The data presented in this study are available on request from the corresponding author. option 2: Data is contained within the article. Please confirm the suggested change. %% BMB: "Data are contained in the article" is incorrect/misleading, because there is no data in the article ...
\acknowledgments{Thanks %MDPI: Please ensure that all individuals included in this section have consented to the acknowledgement.
to Jonathan Dushoff for conversations on these topics over many
years. Dana Karelus, Daniel Turek, and Jeff Walker provided useful
input; Noam Ross encouraged me to finally submit the paper; Tara Bolker
gave advice on straw men; three anonymous reviewers gave useful
feedback. This work was supported by multiple NSERC Discovery grants.
%MDPI: Please state the necessary software version number.
%% BMB: I removed the term 'epistemic uncertainty' because I had second thoughts about it, so the ref to Microsoft Copilot is no longer necessary
}
\conflictsofinterest{The author declares no conflicts of interest. The funder had no role in the writing of the manuscript or in the decision to publish the results.
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Optional
%% Only for journal Encyclopedia
%\entrylink{The Link to this entry published on the encyclopedia platform.}
\abbreviations{Abbreviations}{
The following abbreviations are used in this manuscript:\\
\noindent
\begin{tabular}{@{}ll}
CI & confidence interval \\
MMA & multi-model averaging \\
NHST & null-hypothesis significance testing
\end{tabular}
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{adjustwidth}{-\extralength}{0cm}
%\printendnotes[custom] % Un-comment to print a list of endnotes
\reftitle{References %MDPI: Reference Section have been completed layout and ready for publication. Please do not change it. Otherwise we cannot process to the next step. However, you may change the reference orders as necessary. But please make sure every reference should be cited (not missing) and Reference cited in the main text match the reference in the Reference List. Please provide the detailed information if required in the comments below. Or please provide the website links and accessed date (Day Month Year) if you cannot provide detailed information. Very Important: References are not allowed to be added or deleted without reasons after the manuscript is accepted.
%% BMB: I have made minor changes where your changes were incorrect (e.g., authors' first and last names switched) or did not match your reference style (e.g., authors' names not initialized correctly). No references have been added or deleted.
}
% Please provide either the correct journal abbreviation (e.g. according to the “List of Title Word Abbreviations” http://www.issn.org/services/online-services/access-to-the-ltwa/) or the full name of the journal.
% Citations and References in Supplementary files are permitted provided that they also appear in the reference list here.
%=====================================
% References, variant A: external bibliography
%=====================================
%\bibliography{your_external_BibTeX_file}
%=====================================
% References, variant B: internal bibliography
%=====================================
\begin{thebibliography}{999}
\phantomsection\label{refs}
\bibitem{ref-Gruner+2008}
Gruner, D.S.; Smith, J.E.; Seabloom, E.W.; Sandin, S.A.; Ngai, J.T.; Hillebrand, H.; Harpole, W.S.; Elser, J.J.; Cleland, E.E.; Bracken, M.E.; et al. A Cross-System Synthesis of Consumer and Nutrient Resource Control on Producer Biomass.
\emph{Ecol. Lett.} \textbf{2008}, \emph{11}, 740--755.
\bibitem{ref-brando_fire-induced_2012}
Brando, P.M.; Nepstad, D.C.; Balch, J.K.; Bolker, B.; Christman, M.C.; Coe, M.; Putz, F.E. Fire-Induced Tree Mortality in a
Neotropical Forest: The Roles of Bark Traits, Tree Size, Wood Density
and Fire Behavior. \emph{Glob. Chang. Biol.} \textbf{2012}, \emph{18}, 630--641.
\url{https://doi.org/10.1111/j.1365-2486.2011.02533.x}.
\bibitem{ref-ghenu_multicopy_2016}
Ghenu, A.-H.; Bolker, B.M.; Melnick, D.J.; Evans, B.J. Multicopy Gene Family Evolution on Primate {Y}
Chromosomes. \emph{BMC Genom.} \textbf{2016}, \emph{17}, 157.
\url{https://doi.org/10.1186/s12864-015-2187-8}.
\bibitem{ref-mcgill_why_2016}
McGill, B. Why Ecology Is Hard (and Fun)---Multicausality. \emph{Dynamic Ecology}; 2016. %MDPI: Please add the name of the publisher and their location. %% BMB: this is a blog post, there is no publisher.
Available online: \url{https://dynamicecology.wordpress.com/2016/03/02/why-ecology-is-hard-and-fun-multicausality/} (accessed on 2 June 2024). %MDPI: Please add the access date (format: Date Month Year), e.g., accessed on 1 January 2020. %%BMB: done.
\bibitem{ref-platt_strong_1964}
Platt, J.R. Strong {Inference}. \emph{Science} \textbf{1964}, \emph{146}, 347--353. %MDPI: We delete New Series, please confirm. %% BMB: confirmed.
\url{https://doi.org/10.2307/1714268}.
\bibitem{ref-fox_why_2016}
Fox, J. Why Don't More Ecologists Use Strong Inference?
\emph{Dynamic Ecology}; 2016. Available online:
\url{https://dynamicecology.wordpress.com/2016/06/01/obstacles-to-strong-inference-in-ecology/} (accessed on 2 June 2024). %MDPI: Please add the access date (format: Date Month Year), e.g., accessed on 1 January 2020. if it is journal, please add volume and page number %% BMB: blog post; access date added; see McGill reference
\bibitem{ref-betini_why_2017}
Betini, G.S.; Avgar, T.; Fryxell, J.M. Why Are We
Not Evaluating Multiple Competing Hypotheses in Ecology and
Evolution? \emph{R. Soc. Open Sci.} \textbf{2017}, \emph{4}, 160756.
\url{https://doi.org/10.1098/rsos.160756}.
\bibitem{ref-taper_evidential_2015}
Taper, M.L.; Ponciano, J.M. Evidential Statistics
as a Statistical Modern Synthesis to Support 21st Century Science.
\emph{Popul. Ecol.} \textbf{2015}, \emph{58}, 9--29.
\url{https://doi.org/10.1007/s10144-015-0533-y}.
\bibitem{ref-hastieElements2009}
Hastie, T.; Tibshirani, R.; Friedman, J.H. \emph{The
Elements of Statistical Learning: Data Mining, Inference, and
Prediction}; Springer: New York, NY, USA, 2009.
\bibitem{ref-burnham_model_2002}
Burnham, K.P.; Anderson, D.R.
\emph{{Model} %MDPI: Please add the author name information. %% BMB: done.
{Selection} and {Multimodel} {Inference}:
{A} {Practical} {Information}-Theoretic {Approach}}; Springer: Berlin/Heidelberg, Germany, %Newly added information, please confirm %% BMB: confirmed
2002.
\bibitem{ref-moritzRole2023a}
Moritz, M.A.; Batllori, E.; Bolker, B.M. The
Role of Fire in Terrestrial Vertebrate Richness Patterns.
\emph{Ecol. Lett.} \textbf{2023}, \emph{26}, 563--574.
\url{https://doi.org/10.1111/ele.14177}.
\bibitem{ref-brewer_relative_2016}
Brewer, M.J.; Butler, A.; Cooksley, S.L. The
Relative Performance of {AIC}, {AICC} and {BIC} in the Presence of
Unobserved Heterogeneity. \emph{Methods Ecol. Evol.} \textbf{2016}, \emph{7}, 679--692. \url{https://doi.org/10.1111/2041-210X.12541}.
\bibitem{ref-mundryIssues2011}
Mundry, R. Issues in Information Theory-Based Statistical
Inference---A Commentary from a Frequentist's Perspective.
\emph{Behav. Ecol. Sociobiol.} \textbf{2011}, \emph{65}, 57--68.
\bibitem{ref-cade_model_2015}
Cade, B.S. Model Averaging and Muddled Multimodel
Inference. \emph{Ecology} \textbf{2015}, \emph{96}, 2370--2382. \url{https://doi.org/10.1890/14-1639.1}.
\bibitem{ref-bernhardtInterpretation1979}
Bernhardt, I.; Jung, B.S. The Interpretation of Least
Squares Regression with Interaction or Polynomial Terms. \emph{Rev.
Econ. Stat.} \textbf{1979}, \emph{61}, 481--483.
\url{https://doi.org/10.2307/1926085}.
\bibitem{ref-jones_sensible_2000}
Jones, L.V.; Tukey, J.W. A {Sensible} {Formulation}
of the {Significance} {Test}. \emph{Psychol. Methods} \textbf{2000}, \emph{5},
411--414. \url{https://doi.org/10.1037//1082-989X.5.4.411}.
\bibitem{ref-dushoff_i_2019}
Dushoff, J.; Kain, M.P.; Bolker, B.M. I
Can See Clearly Now: {Reinterpreting} Statistical Significance.
\emph{Methods Ecol. Evol.} \textbf{2019}, \emph{10}, 756--759.
\url{https://doi.org/10.1111/2041-210X.13159}.
\bibitem{ref-mengStatistical2018}
Meng, X. Statistical Paradises and Paradoxes in Big Data
({I}): {Law} of Large Populations, Big Data Paradox, and the 2016 {US}
Presidential Election. \emph{Ann. Appl. Stat.} \textbf{2018}, \emph{12},
685--726. \url{https://doi.org/10.1214/18-AOAS1161SF}.
\bibitem{ref-harrell_regression_2001}
Harrell, F. \emph{Regression Modeling Strategies};
Springer: Berlin/Heidelberg, Germany, %Newly added information, please confirm %% BMB: confirmed
2001.
\bibitem{ref-romanoStepwise2005}
Romano, J.P.; Wolf, M. Stepwise {Multiple
Testing} as {Formalized Data Snooping}. \emph{Econometrica} \textbf{2005}, \emph{73},
1237--1282. \url{https://doi.org/10.1111/j.1468-0262.2005.00615.x}.
\bibitem{ref-whittingham_why_2006}
Whittingham, M.J.; Stephens, P.A.; Bradbury, R.B.; Freckleton, R.P. Why Do We Still Use Stepwise Modelling in
Ecology and Behaviour? \emph{J. Anim. Ecol.} \textbf{2006}, \emph{75},
1182--1189. \url{https://doi.org/10.1111/j.1365-2656.2006.01141.x}.
\bibitem{ref-mundryStepwise2009}
Mundry, R.; Nunn, C.L. Stepwise {Model Fitting}
and {Statistical Inference}: {Turning Noise} into {Signal Pollution}.
\emph{Am. Nat.} \textbf{2009}, \emph{173}, 119--123.
\url{https://doi.org/10.1086/593303}.
\bibitem{ref-freckleton_dealing_2011}
Freckleton, R.P. Dealing with Collinearity in Behavioural
and Ecological Data: Model Averaging and the Problems of Measurement
Error. \emph{Behav. Ecol. Sociobiol.} \textbf{2011}, \emph{65}, 91--101.
\bibitem{ref-galipaud_ecologists_2014}
Galipaud, M.; Gillingham, M.A.F.; David, M.; Dechaume-Moncharmont, F.-X. Ecologists Overestimate
the Importance of Predictor Variables in Model Averaging: A Plea for
Cautious Interpretations. \emph{Methods Ecol. Evol.} \textbf{2014}, \emph{5}, 983--991. \url{https://doi.org/10.1111/2041-210X.12251}.
\bibitem{ref-zhang_model_2015}
Zhang, X.; Zou, G.; Carroll, R.J. Model
Averaging Based on {Kullback}-{Leibler} Distance. \emph{Stat.
Sin.} \textbf{2015}, \emph{25}, 1583--1598. \url{https://doi.org/10.5705/ss.2013.326}.
\bibitem{ref-grahamConfronting2003}
Graham, M.H. Confronting {Multicollinearity} in
{Ecological Multiple Regression}. \emph{Ecology} \textbf{2003}, \emph{84}, 2809--2815.
\url{https://doi.org/10.1890/02-3114}.
\bibitem{ref-morrisseyMultiple2018}
Morrissey, M.B.; Ruxton, G.D. Multiple
{Regression Is Not Multiple Regressions}: {The Meaning} of {Multiple
Regression} and the {Non-Problem} of {Collinearity}.
\emph{Philos. Theory Pract. Biol.} \textbf{2018}, \emph{10}, 3.
\url{https://doi.org/10.3998/ptpbio.16039257.0010.003}.
\bibitem{ref-fengCollinearity2019a}
Feng, X.; Park, D.S.; Liang, Y.; Pandey, R.; Papeş, M. Collinearity in Ecological Niche Modeling: {Confusions} and
Challenges. \emph{Ecol. Evol.} \textbf{2019}, \emph{9}, 10365--10376.
\url{https://doi.org/10.1002/ece3.5555}.
\bibitem{ref-vanhoveCollinearity2021}
Vanhove, J. Collinearity Isn't a Disease That Needs
Curing. \emph{Meta-Psychology} \textbf{2021}, \emph{5}, 1--11.
\url{https://doi.org/10.15626/MP.2021.2548}.
\bibitem{ref-BurnAnde98}
Burnham, K.P.; Anderson, D.R. \emph{Model Selection
and Inference: A Practical Information-Theoretic Approach}; Springer: New York, NY, USA, 1998.
\bibitem{ref-johnsonModel2004a}
Johnson, J.B.; Omland, K.S. Model Selection in
Ecology and Evolution. \emph{Trends Ecol. Evol.} \textbf{2004}, \emph{19},
101--108. \url{https://doi.org/10.1016/j.tree.2003.10.013}.
\bibitem{ref-chamberlinMethod1890}
Chamberlin, T.C. The Method of Multiple Working Hypotheses.
\emph{Science} \textbf{1890}, \emph{15}, 92--96.
\url{https://doi.org/10.1126/science.ns-15.366.92}.
\bibitem{ref-gelmanPhilosophy2013}
Gelman, A.; Shalizi, C.R. Philosophy and the
Practice of {Bayesian} Statistics. \emph{Br. J. Math. Stat. Psychol.} \textbf{2013}, \emph{66}, 8--38.
\url{https://doi.org/10.1111/j.2044-8317.2011.02037.x}.
\bibitem{ref-bernardoBayesian1994}
Bernardo, J.M.; Smith, A.F.M. \emph{Bayesian
{Theory}}, 1st ed.; {John Wiley \& Sons, Ltd.}: Hoboken, NJ, USA, %Newly added information, please confirm. %% BMB: confirmed
1994.
\url{https://doi.org/10.1002/9780470316870}.
\bibitem{ref-barker_truth_2015}
Barker, R.J.; Link, W.A. Truth, Models, Model
Sets, {AIC}, and Multimodel Inference: {A} {Bayesian} Perspective.
\emph{J. Wildl. Manag.} \textbf{2015}, \emph{79}, 730--738.
\url{https://doi.org/10.1002/jwmg.890}.
\bibitem{ref-poncianoMultimodel2018}
Ponciano, J.M.; Taper, M.L. Multi-Model Inference
Through Projections in Model Space. \emph{arXiv}, \textbf{2018}, arXiv:1805.08765.
\bibitem{ref-luttbeg_comparing_2004}
Luttbeg, B.; Langen, T.A. Comparing Alternative Models to Empirical Data: Cognitive Models of Western Scrub-Jay Foraging Behavior. \emph{Am. Nat.} \textbf{2004}, \emph{163}, 263--276.
\url{https://doi.org/10.1086/381319}.
\bibitem{ref-dahlgren_alternative_2010}
Dahlgren, J.P. {Alternative Regression Methods Are Not
Considered in {Murtaugh} (2009) or by Ecologists in General.}
\emph{Ecol. Lett.} \textbf{2010}, \emph{13}, E7--E9.
\url{https://doi.org/10.1111/j.1461-0248.2010.01460.x}.
\bibitem{ref-lemoineMoving2019a}
Lemoine, N.P. Moving Beyond Noninformative Priors: Why and
How to Choose Weakly Informative Priors in {Bayesian} Analyses.
\emph{Oikos} \textbf{2019}, \emph{128}, 912--928. \url{https://doi.org/10.1111/oik.05985}.
\bibitem{ref-rasmussenGaussian2005}
Rasmussen, C.E.; Williams, C.K.
\emph{Gaussian {Processes} for {Machine Learning}}; The
MIT Press: Cambridge, MA, USA, 2006.
\bibitem{ref-woodGeneralized2017}
Wood, S.N. \emph{Generalized Additive Models: An Introduction
with {R}}; {CRC Texts} in {Statistical Science}; Chapman \& Hall: London, UK, 2017. %% BMB: confirmed
\bibitem{ref-chandrashekarSurvey2014}
Chandrashekar, G.; Sahin, F. A Survey on Feature
Selection Methods. \emph{Comput. Electr. Eng.} \textbf{2014}, \emph{40},
16--28. \url{https://doi.org/10.1016/j.compeleceng.2013.11.024}.
\bibitem{ref-koenkerQuantile2017}
Koenker, R. Quantile {Regression}: 40 {Years on}.
\emph{Annu. Rev. Econ.} \textbf{2017}, \emph{9}, 155--176.
\url{https://doi.org/10.1146/annurev-economics-063016-103651}.
\bibitem{ref-wenger_assessing_2012}
Wenger, S.J.; Olden, J.D. Assessing Transferability
of Ecological Models: An Underappreciated Aspect of Statistical
Validation. \emph{Methods Ecol. Evol.} \textbf{2012}, \emph{3}, 260--267.
\url{https://doi.org/10.1111/j.2041-210X.2011.00170.x}.
\bibitem{ref-robertsCrossvalidation2016}
Roberts, D.R.; Bahn, V.; Ciuti, S.; Boyce, M.S.; Elith, J.; Guillera-Arroita, G.; Hauenstein, S.; Lahoz-Monfort, J.J.; Schröder, B.; Thuiller, W.; et al. Cross-Validation Strategies for Data with Temporal, Spatial,
Hierarchical, or Phylogenetic Structure. \emph{Ecography} \textbf{2016}, \emph{40},
913--929. \url{https://doi.org/10.1111/ecog.02881}.
\bibitem{ref-chungNondegenerate2013}
Chung, Y.; Rabe-Hesketh, S.; Dorie, V.; Gelman, A.; Liu, J. {A {Nondegenerate Penalized Likelihood Estimator}
for {Variance Parameters} in {Multilevel Models}.}
\emph{Psychometrika} \textbf{2013}, \emph{78}, 685--709.
\url{https://doi.org/10.1007/s11336-013-9328-2}.
\bibitem{ref-potscherConfidence2010a}
Pötscher, B.M.; Schneider, U. Confidence Sets
Based on Penalized Maximum Likelihood Estimators in {Gaussian}
Regression. \emph{Electron. J. Stat.} \textbf{2010}, \emph{4}, 334--360. \url{https://doi.org/10.1214/09-EJS523}.
\bibitem{ref-javanmard_confidence_2014}
Javanmard, A.; Montanari, A. Confidence Intervals and
Hypothesis Testing for High-Dimensional Regression. \emph{J.
Mach. Learn. Res.} \textbf{2014}, \emph{15}, 2869--2909.
\url{http://dl.acm.org/citation.cfm?id=2697057}.
\bibitem{ref-lockhart_significance_2014}
Lockhart, R.; Taylor, J.; Tibshirani, R.J.; Tibshirani, R. A Significance Test for the Lasso. \emph{Ann.
Stat.} \textbf{2014}, \emph{42}, 413--468.
\url{http://www.ncbi.nlm.nih.gov/pmc/articles/PMC4285373/}.
\bibitem{ref-taylorPostselection2018}
Taylor, J.; Tibshirani, R. Post-Selection
Inference for {L1-penalized} Likelihood Models. \emph{Can.
J. Stat.} \textbf{2018}, \emph{46}, 41--61.
\url{https://doi.org/10.1002/cjs.11313}.
\bibitem{ref-obenchain_classical_1977}
Obenchain, R. Classical \(F\)-Tests and Confidence Regions for
Ridge Regression. \emph{Technometrics} \textbf{1977}, \emph{19}, 429--439.
\bibitem{ref-wangInterval2013b}
Wang, H.; Zhou, S.Z.F. Interval Estimation by
Frequentist Model Averaging. \emph{Commun. Stat. Theory Methods} \textbf{2013}, \emph{42}, 4342--4356.
\url{https://doi.org/10.1080/03610926.2011.647218}.
\bibitem{ref-hjortFrequentist2003}
Hjort, N.L.; Claeskens, G. Frequentist {Model
Average Estimators}. \emph{J. Am. Stat.
Assoc.} \textbf{2003}, \emph{98}, 879--899.
\url{https://doi.org/10.1198/016214503000000828}.
\bibitem{ref-turek2012model}
Turek, D.; Fletcher, D. Model-Averaged {Wald}
Confidence Intervals. \emph{Comput. Stat. Data
Anal.} \textbf{2012}, \emph{56}, 2809--2815.
\url{https://doi.org/10.1016/j.csda.2012.03.002}.
\bibitem{ref-fletcher2012model}
Fletcher, D.; Turek, D. Model-Averaged Profile
Likelihood Intervals. \emph{J. Agric. Biol.
Environ. Stat.} \textbf{2012}, \emph{17}, 38--51.
\bibitem{ref-turek2013frequentist}
Turek, D.B. Frequentist Model-Averaged Confidence
Intervals. Ph.D. Thesis, University of Otago, Dunedin, New Zealand, 2013. %% BMB: confirmed
\url{https://www.otago.ourarchive.ac.nz/bitstream/handle/10523/3923/TurekDanielB2013PhD.pdf}.
\bibitem{ref-turek2015comparison}
Turek, D. Comparison of the Frequentist {MATA} Confidence
Interval with {Bayesian} Model-Averaged Confidence Intervals.
\emph{J. Probab. Stat.} \textbf{2015}, \emph{2015}, 420483.
\url{https://doi.org/10.1155/2015/420483}.
\bibitem{ref-kabaila_model-averaged_2016}
Kabaila, P.; Welsh, A.H.; Abeysekera, W.
Model-{Averaged} {Confidence} {Intervals}. \emph{Scand.
J. Stat.} \textbf{2016}, \emph{43}, 35--48.
\url{https://doi.org/10.1111/sjos.12163}.
\bibitem{ref-dormann_model_2018}
Dormann, C.F.; Calabrese, J.M.; Guillera-Arroita, G.; Matechou, E.; Bahn, V.; Bartoń, K.; Beale, C.M.; Ciuti, S.; Elith, J.; Gerstner, K.; et al. Model Averaging in Ecology: A Review of {Bayesian},
Information-Theoretic and Tactical Approaches for Predictive
Inference. \emph{Ecol. Monogr.} \textbf{2018}, \emph{88}, 485--504.
\url{https://doi.org/10.1002/ecm.1309}.
\bibitem{ref-burnhamMultimodel2004b}
Burnham, K.P.; Anderson, D.R. {Multimodel Inference: Understanding {AIC} and {BIC}
in Model Selection}. \emph{Sociol. Methods Res.} \textbf{2004}, \emph{33},
261--304. \url{https://doi.org/10.1177/0049124104268644}.
\bibitem{ref-murtaugh_performance_2009}
Murtaugh, P.A. Performance of Several Variable-Selection
Methods Applied to Real Ecological Data. \emph{Ecol. Lett.} \textbf{2009}, \emph{12}, 1061--1068. \url{https://doi.org/10.1111/j.1461-0248.2009.01361.x}.
\end{thebibliography}
% If authors have biography, please use the format below
%\section*{Short Biography of Authors}
%\bio
%{\raisebox{-0.35cm}{\includegraphics[width=3.5cm,height=5.3cm,clip,keepaspectratio]{Definitions/author1.pdf}}}
%{\textbf{Firstname Lastname} Biography of first author}
%
%\bio
%{\raisebox{-0.35cm}{\includegraphics[width=3.5cm,height=5.3cm,clip,keepaspectratio]{Definitions/author2.jpg}}}
%{\textbf{Firstname Lastname} Biography of second author}
% For the MDPI journals use author-date citation, please follow the formatting guidelines on http://www.mdpi.com/authors/references
% To cite two works by the same author: \citeauthor{ref-journal-1a} (\citeyear{ref-journal-1a}, \citeyear{ref-journal-1b}). This produces: Whittaker (1967, 1975)
% To cite two works by the same author with specific pages: \citeauthor{ref-journal-3a} (\citeyear{ref-journal-3a}, p. 328; \citeyear{ref-journal-3b}, p.475). This produces: Wong (1999, p. 328; 2000, p. 475)
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% for journal Sci
%\reviewreports{\\
%Reviewer 1 comments and authors’ response\\
%Reviewer 2 comments and authors’ response\\
%Reviewer 3 comments and authors’ response
%}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\PublishersNote{}
\end{adjustwidth}
\end{document}