Merge pull request #579 from ryankelly-uiuc/lastminutetweaks

Last minute tweaks
PecanProject · Jul 22, 2015 · 675c8e8 · 675c8e8
2 parents 5e0c818 + 3bf3b59
commit 675c8e8
Show file tree

Hide file tree

Showing 8 changed files with 208 additions and 4 deletions.
diff --git a/modules/assim.batch/R/pda.mcmc.R b/modules/assim.batch/R/pda.mcmc.R
@@ -45,7 +45,9 @@ pda.mcmc <- function(settings, params.id=NULL, param.names=NULL, prior.id=NULL,
   }
 
   ## Load priors
-  prior <- pda.load.priors(settings, con)
+  temp <- pda.load.priors(settings, con)
+  prior <- temp$prior
+  settings <- temp$settings
   pname <-  rownames(prior) 
   n.param.all  <- nrow(prior)
 
@@ -119,6 +121,10 @@ pda.mcmc <- function(settings, params.id=NULL, param.names=NULL, prior.id=NULL,
       showWarnings=F, recursive=T)
   }
 
+  ## save updated settings XML. Will be overwritten at end, but useful in case of crash
+  saveXML(listToXml(settings, "pecan"), file=file.path(settings$outdir, 
+    paste0('pecan.pda', settings$assim.batch$ensemble.id, '.xml')))
+
   ## --------------------------------- Main MCMC loop --------------------------------- ##
   for(i in start:finish){
     logger.info(paste("Data assimilation MCMC iteration",i,"of",finish))
@@ -127,6 +133,10 @@ pda.mcmc <- function(settings, params.id=NULL, param.names=NULL, prior.id=NULL,
     if(i %% settings$assim.batch$jump$adapt < 1){
       settings <- pda.adjust.jumps(settings, accept.rate, pnames=pname[prior.ind])
       accept.rate <- numeric(n.param)
+
+      # Save updated settings XML. Will be overwritten at end, but useful in case of crash
+      saveXML(listToXml(settings, "pecan"), file=file.path(settings$outdir, 
+        paste0('pecan.pda', settings$assim.batch$ensemble.id, '.xml')))
     }
 
     for(j in 1:n.param){

diff --git a/modules/assim.batch/R/pda.mcmc.recover.R b/modules/assim.batch/R/pda.mcmc.recover.R
@@ -0,0 +1,84 @@
+##' Clean up a failed PDA run
+##'
+##' @title Clean up a failed PDA run
+##' @param all params are the identically named variables in pda.mcmc / pda.emulator
+##'
+##' @return An updated settings list
+##'
+##' @author Ryan Kelly
+##' @export
+
+# This is just a quick kludgey version, that relies on temporary files to recover a failed pda.mcmc() call. It writes all outputs based on whatever runs were done, and returns the same updated settings list that would have been returned if the run completed. So, recover like this:
+#
+# read.settings(path/to/original/settings/file)
+# settings$assim.batch <- pda.mcmc.recover(settings) # wrap up unfinished run
+# settings$assim.batch <- pda.mcmc(settings) # start new pda
+pda.mcmc.recover <- function(settings, params.id=NULL, param.names=NULL, prior.id=NULL, chain=NULL, 
+                     iter=NULL, adapt=NULL, adj.min=NULL, ar.target=NULL, jvar=NULL, n.knot=NULL) {
+
+  if(FALSE){
+    params.id <- param.names <- prior.id <- chain <- iter <- NULL 
+    n.knot <- adapt <- adj.min <- ar.target <- jvar <- NULL
+  }
+
+  require(coda)
+
+  ## Handle settings
+    settings <- pda.settings(
+                  settings=settings, params.id=params.id, param.names=param.names, 
+                  prior.id=prior.id, chain=chain, iter=iter, adapt=adapt, 
+                  adj.min=adj.min, ar.target=ar.target, jvar=jvar, n.knot=n.knot)
+
+  ## Open database connection
+  if(settings$database$bety$write){
+    con <- try(db.open(settings$database$bety), silent=TRUE)
+    if(is.character(con)){
+      con <- NULL
+    }
+  } else {
+    con <- NULL
+  }
+
+  ## Load priors
+  prior <- pda.load.priors(settings, con)$prior
+  pname <-  rownames(prior) 
+  n.param.all  <- nrow(prior)
+
+  # Get start and finish
+  params.dummy <- pda.init.params(settings, con, pname, n.param.all)
+    start  <- params.dummy$start
+    finish <- params.dummy$finish
+
+  ## Select parameters to constrain
+  prior.ind <- which(rownames(prior) %in% settings$assim.batch$param.names)
+  n.param <- length(prior.ind)
+
+  ## Get the workflow id
+  if ("workflow" %in% names(settings)) {
+    workflow.id <- settings$workflow$id
+  } else {
+    workflow.id <- -1
+  }
+
+  ## Get ensemble id from diagnostic plot dir
+  ens.ids <- as.numeric(sub("diag.pda", "", dir(settings$outdir, "diag.pda")))
+  settings$assim.batch$ensemble.id <- as.character(max(ens.ids))
+
+
+  ## Load up temp file to recreate params
+  params = as.matrix(read.table(file.path(settings$outdir, "pda.mcmc.txt")))
+  colnames(params) <- pname
+
+  ## Update iters
+  settings$assim.batch$iter <- finish - nrow(params)
+
+  ## Save outputs to plots, files, and db
+  settings <- pda.postprocess(settings, con, params, pname, prior, prior.ind)
+
+  ## close database connection
+  if(!is.null(con)) db.close(con)
+
+  ## Output an updated settings list
+  return(settings$assim.batch)
+
+} ## end pda.mcmc
diff --git a/modules/assim.batch/R/pda.utils.R b/modules/assim.batch/R/pda.utils.R
@@ -275,7 +275,7 @@ pda.load.priors <- function(settings, con) {
       "] but no prior found!"))
   }
 
-  return(prior.out)
+  return(list(prior=prior.out, settings=settings))
 }
 
 

diff --git a/modules/assim.batch/vignettes/AssimBatchVignette.Rmd b/modules/assim.batch/vignettes/AssimBatchVignette.Rmd
@@ -0,0 +1,93 @@
+---
+title: "PEcAn.assim.batch Vignette"
+author: "Ryan Kelly"
+date: "July 12, 2015"
+output: html_document
+---
+
+## install package from Github
+
+Only needs to be done the first time
+```{r}
+library(devtools)
+install_github("PecanProject/pecan",subdir="all")
+```
+
+
+## Add <assim.batch> tags to pecan.xml
+
+The easiest way to run PEcAn's parameter data assimilation is to add an `<assim.batch>` block to pecan.xml, load the file with `read.settings`, and pass the resulting settings object to `pda.mcmc()`. Here is an example `<assim.batch>` block:
+
+```
+<assim.batch>
+  <iter>100</iter>
+  <prior>
+    <path>/path/to/(prior/post).distns.Rdata</path>
+  </prior>
+  <param.names>
+    <param>Amax</param>
+  </param.names>
+  <inputs>
+    <file>
+      <id>1000000358</id>
+      <format>Ameriflux.L4</format>
+      <data.model>
+        <likelihood>Laplace</likelihood>
+        <variable.id>297</variable.id>
+      </data.model>
+    </file>
+  </inputs>
+  <jump>
+    <ar.target>0.5</ar.target>
+    <adapt>25</adapt>
+    <jvar>
+      <jvar>0.1</jvar>
+    </jvar>
+    <adj.min>0.1</adj.min>
+  </jump>
+  <diag.plot.iter>25</diag.plot.iter>
+</assim.batch>
+```
+
+
+Here are details about the settings:
+
+
+* `<iter>` Specifies the number of MCMC iterations to run. If continuing a previous MCMC, this is the number of additional iterations, which will be added to the previous total. Defaults to 100 if missing. Ignored by pda.emulator(). 
+* `<prior>` Identifies the prior to be used for PDA. Can be one of either:
+  + `<posterior.id>` A posterior ID in BETY specifying the posterior from a previous PEcAn analysis (e.g., meta-analysis or previous PDA) to be used as the prior for PDA. Defaults to the most recent relevant posterior in the database if omitted (and no `<path>` specified instead; see below).
+  + `<path>` As an alternative to using a posterior ID, can specify a file path to either a `prior.distns.Rdata` or `post.distns.Rdata` file generated from an earlier analysis. Conceptually, using a posterior distribution as the prior for PDA is preferred, as this allows the multiple analyses to work together to iteratively constrain parameters. In practice, previous analyses may have over-constrained parameters to ranges that do not actually optimize model outputs, so using a less informative prior for PDA might yield better results. 
+* `<param.names>` The names of parameters to be constrained by assimilation, listed in individual `<param>` tags. These must be the standard names given by the trait dictionary, i.e.:
+
+  ```
+  data(trait.dictionary, package = "PEcAn.utils")
+  trait.dictionary[,c("id", "figid")]
+  ```
+
+* `<inputs>` Observation data to be compared to the model. In principle, can be one or more datasets, specified in a variety of ways. In practice, the code is only tested for a single dataset currently, and assumes the input is Ameriflux NEE. 
+  + `<file>` Denotes a set of tags for a single input. Would be repeated for multiple datasets once that functionality is supported. 
+    + `<id>` BETY input ID for looking up the input. Will be used preferentially over `<path>` or `<source>` (below). 
+    + `<path>` File path to the input. Only used if no `<id>` is given. 
+    + `<source>` A standardized source of input data (e.g., Ameriflux). Not implemented yet, but the idea would be similar to the met workflow, PEcAn would be able to use standard data sources automatically where available. Only used if no `<id>` or `<path>` is given.
+    + `<format>` An identifier to tell PEcAn how to handle the input. Currently allows either `Ameriflux.L2` or `Ameriflux.L4`. 
+    + `<data.model>` Block for specifying the data model to be used with this dataset. Currently ignored since all data are assumed to be Ameriflux NEE and treated identically.
+      + `<likelihood>` Identifier for the likelihood to use. E.g., the Ameriflux data use a Laplacian likelihood. 
+      + `<variable.id>` The BETY variable ID associated with this dataset. The idea is that specific preprocessing steps (e.g., estimating heteroskedastic error for tower NEE) would be associated with particular IDs. Could automate further by assigning default `<likelihood>` to variable.id values (allowing `<likelihood>` to be omitted from pecan.xml). And/or could add a separate tag `<preprocess>` that could specify a function to override any default associated with the variable ID.
+* `<jump>` 
+  + `<ar.target>` Target acceptance rate for the adaptive jump algorithm. Defaults to 0.5 if missing. 
+  + `<adapt>` Number of iterations between jump variance adaptations. Defaults to `floor(iter/10)` if missing. 
+  + `<jvar>` Initial jump variances for proposing parameter values, listed in individual `<jvar>` tags (one for each `<param>` specified in `<param.names>`). Will be adjusted adaptively. Defaults to 1/10 the prior variance if missing.
+  + `<adj.min>` Minimum factor by which to reduce jump variance when adapting. Prevents jump variances from degenerating to 0. Defaults to 0.1 if missing. 
+* `<diag.plot.iter>` Interval between saving diagnostic plots. Omit or set to NULL to skip them.
+* `<params.id>` (Not shown.) A BETY dbfile ID for an MCMC output from previous PDA. If specified, that file is loaded, the new MCMC starts from the last parameter values of the previous, and when finished the extended chain is saved as a new output. If missing, then MCMC starts fresh from prior median parameter values. Regardless, the MCMC parameter values of the PDA are saved to file and inserted in BETY, and the new dbfile ID is inserted into `<params.id>`. The `pda.mcmc()` funtion returns the `<assim.batch>` settings, which can then be saved. Then, calling a new round of PDA using these returned settings will automatically continue the previous MCMC. 
+
+
+
+## Run PDA
+
+
+
+
+
+
+
diff --git a/modules/uncertainty/R/run.ensemble.analysis.R b/modules/uncertainty/R/run.ensemble.analysis.R
@@ -172,7 +172,7 @@ read.ensemble.ts <- function(model, ensemble.id=NULL, variable=NULL, start.year=
   # Can specify ensemble ids manually. If not, look in settings. If none there, just look in samples.Rdata, which for backwards compatibility still contains the sample info for (the most recent)  sensitivity and ensemble analysis combined.
   if(!is.null(ensemble.id)) {
     fname <- ensemble.filename(settings, "ensemble.samples", "Rdata", 
-               ensemble.id=ens.ensemble.id, all.var.yr=TRUE)
+               ensemble.id=ensemble.id, all.var.yr=TRUE)
   } else if(!is.null(settings$ensemble$ensemble.id)) {
     ensemble.id <- settings$ensemble$ensemble.id
     fname <- ensemble.filename(settings, "ensemble.samples", "Rdata", 

diff --git a/scripts/workflow.pda.recover.r b/scripts/workflow.pda.recover.r
@@ -0,0 +1,15 @@
+library(PEcAn.all)
+
+# Insert settings file path here. Use the same one supplied to the original failed workflow
+settings.file = ""
+
+# Read in settings
+settings <- read.settings(settings.file)
+
+# Clean and wrap up failed PDA MCMC
+settings$assim.batch <- pda.mcmc.recover(settings)
+
+# If desired, now proceed to complete the run
+status.start("PDA")
+settings$assim.batch <- pda.mcmc(settings)
+status.end()
diff --git a/utils/R/run.write.configs.R b/utils/R/run.write.configs.R
@@ -80,6 +80,9 @@ run.write.configs <- function(settings, write = TRUE, ens.sample.method="halton"
     if(!is.na(posterior.files[i])) {
       # Load specified file
       load(file.path(outdirs[i], posterior.files[i]))
+      if(!exists('prior.distns') & exists('post.distns')) {
+        prior.distns <- post.distns
+      }
     } else {
       # Default to most recent posterior in the workflow, or the prior if there is none
       fname = file.path(outdirs[i], 'post.distns.Rdata')

diff --git a/web/workflow.R b/web/workflow.R
@@ -191,7 +191,6 @@ if (check.status("SENSITIVITY") == 0){
 if(('assim.batch' %in% names(settings))) {
   status.start("PDA")
   settings$assim.batch <- pda.mcmc(settings)
-  saveXML(listToXml(settings, "pecan"), file=file.path(settings$outdir, 'pecan.pda.xml'))
   status.end()
 }