# Setup chunk of a knitr-tangled vignette script, followed by the first demo
# chunks. NOTE: the original line breaks are missing in this copy, so many
# statements (and their '#' comments) run together on the lines below.
#
# What the code below does, in order:
#   * sets knitr chunk options (collapse, "#>" comment prefix), loads mlrCPO
#     and seeds the RNG with 123.
#   * finds the document being knitted: `base` is knitr's "output.dir", `file`
#     is the basename of `$input` in the first call-stack frame whose call is
#     `knitr::knit` or `knit`; `rpath` is that path with its extension
#     replaced by ".R".
#   * registers a knitr `document` hook that, if `rpath` is an existing file,
#     rewrites it with trailing spaces removed from every line, and returns
#     the document text `x` unchanged.
#   * cross-vignette ToC: for every "*.Rmd" in `base`, derives a slot name
#     ("z_" prefix and "_terse.Rmd" suffix removed -> "compact" entry,
#     otherwise "a_" prefix and ".Rmd" suffix removed -> "main" entry), reads
#     the token following "title:" with scan(), and stops with
#     "parsing error" when no "title:" token is found.
#   * `linkify(info, title)`: returns `title` unchanged for the current file,
#     otherwise a markdown link to the file's ".html" counterpart.
#   * prints one numbered ToC line per slot (slots sorted by name) and checks
#     that a compact title equals the main title (first "<digit>. " removed)
#     followed by "(No Output)", compared after removing a leading "z ".
#   * `printToc(print.level = 3)`: reads `fullpath`, drops lines between
#     ``` fences, collects "#"-headlines and derives link anchors from them.
#   * `replaceprint(ofunc)`: returns a wrapper that captures the output of
#     `ofunc(x, ...)`, drops lines matching "time: [-+e0-9.]{1,6}", prints the
#     rest and returns the value of `ofunc`. The loop after it assigns such a
#     wrapper, under the same name, for every object in the mlr namespace
#     whose name contains "print.".
#   * first demo chunks (cpoScale, cpoAddCols, cpoQuantileBinNumerics, ...).
#
# NOTE(review): the body of `printToc` looks truncated by text extraction.
#   Text is missing after `cat("Table of Contents\n` and between
#   "# higher level -> open a" and `\n") }`; restore it from the original
#   vignette source before running this file.
# NOTE(review): `content$main$iscurrent || content$compact$iscurrent` has a
#   NULL right operand for a slot without a compact version; when the left
#   side is FALSE, `||` raises an error on NULL. Consider isTRUE().
# NOTE(review): the "expected" value in the incompatible-titles message uses
#   the unmodified main title, while the comparison first removes "<digit>. "
#   from it, so the message can show a value the check would not accept.
# NOTE(review): in `replaceprint`, if no captured lines remain after the
#   "time:" filter, `tail(cu, 1)` is empty and the `if` condition has length
#   zero, which is an error.
# NOTE(review): `ismain` is assigned but never read in the code visible here.
## ----results = "asis", echo = FALSE------------------------------------------- # output format should be of the form #> output #> output knitr::opts_chunk$set(collapse = TRUE, comment = "#>") # initialize: load library, make everything deterministic library("mlrCPO") set.seed(123) # get the path of the parent document # path = names(knitr::opts_knit$get("encoding"))[1] base = knitr::opts_knit$get("output.dir") file = sys.frame(min(grep("^knitr::knit$|^knit$", sapply(sys.calls(), function(x) as.character(x)[1]))))$input file = basename(file) path = file.path(base, file) rpath = gsub("\\.[^.]*$", ".R", path) # strip whitespace from lines in tangle (R file) output for lintr knitr::knit_hooks$set(document = function(x) { if (file_test("-f", rpath)) { lines = readLines(rpath) lines = gsub(" *(\n|$)", "\\1", lines) cat(lines, file = rpath, sep = "\n", append = FALSE) } x }) ############################# # do the trans-vignette ToC # ############################# fullfile = file allfiles = list.files(path = base, pattern = ".*\\.Rmd$") stopifnot(file %in% allfiles) # collect information (title, url, main / compact) for each file in vignette dir fileinfolist = list() for (cf in allfiles) { ismain = TRUE if (grepl("^z_", cf)) { infoslot = gsub("^z_", "", cf) infoslot = gsub("_terse\\.Rmd$", "", infoslot) subslot = "compact" } else { infoslot = gsub("^a_", "", cf) infoslot = gsub("\\.Rmd$", "", infoslot) subslot = "main" } content = scan(paste(base, cf, sep = "/"), what = "character", quiet = TRUE) pos = min(c(which(content == "title:"), Inf)) if (is.infinite(pos)) { stop(sprintf("parsing error: %s", cf)) } infolist = list(title = content[pos + 1], url = cf, iscurrent = cf == file) applist = list(infolist) names(applist) = subslot fileinfolist[[infoslot]] = c(fileinfolist[[infoslot]], applist) } # helper function that creates a link for all files except the current one linkify = function(info, title) { if (info$iscurrent) { title } else { sprintf("[%s](%s)", title, 
gsub("\\.Rmd$", ".html", info$url)) } } # output ToC for (idx in seq_along(fileinfolist)) { content = fileinfolist[[sort(names(fileinfolist))[idx]]] if (!is.null(content$compact)) { if (paste(sub("[0-9]\\. ", "", content$main$title), "(No Output)") != sub("^z ", "", content$compact$title)) { stop(sprintf("File %s and its compact version %s have incompatible titles\nThe compact version must be paste(main_title, \"(No Output)\"). Is: '%s', expected: '%s'", content$main$url, content$compact$url, content$compact$title, paste(content$main$title, "(No Output)"))) } line = sprintf("%s (%s)", linkify(content$main, content$main$title), linkify(content$compact, "compact version")) } else { line = linkify(content$main, content$main$title) } cat(sprintf("%s. %s\n", idx, line)) if (content$main$iscurrent || content$compact$iscurrent) { fullfile = content$main$url } } fullpath = file.path(base, fullfile) ############################# # Optional Document TOC # ############################# # print everything up to level `print.level`. # level is the number of '#' prefixes. The lowest level is usually 2. printToc = function(print.level = 3) { owncontent = readLines(fullpath) tripletic = grepl("^```", owncontent) owncontent = owncontent[cumsum(tripletic) %% 2 == 0] # exclude ```-delimited code headlines = grep("^#+ +", owncontent, value = TRUE) headlevels = nchar(gsub(" .*", "", headlines)) headlines = gsub("^[#]+ +", "", headlines) links = gsub("[^-a-z. ]", "", tolower(headlines)) links = gsub(" +", "-", links) links = gsub("-$", "", links) if (!sum(headlevels <= print.level)) { return(invisible(NULL)) } cat("Table of Contents\n
\n", sep = "") lastlevel = headlevels[1] - 1 for (idx in seq_along(headlines)) { line = headlines[idx] level = headlevels[idx] link = links[idx] if (level > print.level) { next } if (level < headlevels[1]) { stop("First headline level must be the lowest one used, but '", line, "' is lower.") } lvldiff = level - lastlevel if (lvldiff > 1) { stop("Cannot jump headline levels. Error on: ", line) } if (lvldiff > 0) { # higher level -> open a
\n") } ############################# # Some output settings # ############################# options(width = 80) replaceprint = function(ofunc) { force(ofunc) function(x, ...) { cu = capture.output({ret = ofunc(x, ...)}) cu = grep("time: [-+e0-9.]{1,6}", cu, value = TRUE, invert = TRUE) cat(paste(cu, collapse = "\n")) if (!grepl("\n$", tail(cu, 1))) { cat("\n") } ret } } for (pfunc in grep("print\\.", ls(asNamespace("mlr")), value = TRUE)) { ofunc = get(pfunc, asNamespace("mlr")) assign(pfunc, replaceprint(ofunc)) } ## ----eval = TRUE, echo = FALSE, results = 'asis'------------------------------ printToc(4) ## ----------------------------------------------------------------------------- cpoScale # a cpo constructor ## ----------------------------------------------------------------------------- cpoAddCols ## ----------------------------------------------------------------------------- cpoScale(center = FALSE) # create a CPO object that scales, but does not center, data ## ----------------------------------------------------------------------------- cpoAddCols(Sepal.Area = Sepal.Length * Sepal.Width) # this would add a column ## ----------------------------------------------------------------------------- iris.demo = iris[c(1, 2, 3, 51, 52, 102, 103), ] tail(iris.demo %>>% cpoQuantileBinNumerics()) # bin the data in below & above median ## ----------------------------------------------------------------------------- # first create three quantile bins, then as.numeric() all columns to # get 1, 2 or 3 as the bin number quantilenum = cpoQuantileBinNumerics(numsplits = 3) %>>% cpoAsNumeric() iris.demo %>>% quantilenum ## ----------------------------------------------------------------------------- quantilenum.restricted = cpoQuantileBinNumerics(numsplits = 3) %>>% cpoAsNumeric(affect.names = "Species", affect.invert = TRUE) iris.demo %>>% quantilenum.restricted ## ----------------------------------------------------------------------------- demo.task = 
makeClassifTask(data = iris.demo, target = "Species")
# ^ value of the `demo.task =` assignment that ends the previous line
result = demo.task %>>% quantilenum
getTaskData(result)

## -----------------------------------------------------------------------------
cpo = cpoScale()
cpo

## -----------------------------------------------------------------------------
# list of parameter names and values
getHyperPars(cpo)

## -----------------------------------------------------------------------------
# more detailed view of parameters and their type / range
getParamSet(cpo)

## -----------------------------------------------------------------------------
# equivalent to print(cpo, verbose = TRUE)
!cpo

## -----------------------------------------------------------------------------
cpo2 = setHyperPars(cpo, scale.scale = FALSE)
cpo2

## -----------------------------------------------------------------------------
# scales and centers
iris.demo %>>% cpo

## -----------------------------------------------------------------------------
# only centers
iris.demo %>>% cpo2

## -----------------------------------------------------------------------------
# not very useful example: two scaling steps, told apart by their ids
cpo = cpoScale(id = "a") %>>% cpoScale(id = "b")
getHyperPars(cpo)

## -----------------------------------------------------------------------------
cpo = cpoPca(export = c("center", "rank"))
getParamSet(cpo)

## -----------------------------------------------------------------------------
transformed = iris.demo %>>% cpoPca(rank = 3)
transformed

## -----------------------------------------------------------------------------
ret = retrafo(transformed)
ret

## -----------------------------------------------------------------------------
iris.demo[1, ] %>>% ret

## -----------------------------------------------------------------------------
iris.demo[1, ] %>>% cpoPca(rank = 3)

## -----------------------------------------------------------------------------
t2 = transformed %>>% cpoScale()
retrafo(t2)

## -----------------------------------------------------------------------------
t3 = clearRI(transformed) %>>% cpoScale()
retrafo(t3)

## -----------------------------------------------------------------------------
all.equal(t2, t3, check.attributes = FALSE)

## -----------------------------------------------------------------------------
# is the same as retrafo(t2) above.
retrafo(transformed) %>>% retrafo(t3)

## -----------------------------------------------------------------------------
iris.regr = makeRegrTask(data = iris.demo, target = "Petal.Width")
iris.logd = iris.regr %>>% cpoLogTrafoRegr()
# log-transformed target 'Petal.Width'
getTaskData(iris.logd)

## -----------------------------------------------------------------------------
# inverter object
inv = inverter(iris.logd)
inv

## -----------------------------------------------------------------------------
logmodel = train("regr.lm", iris.logd)
# prediction on the task itself
pred = predict(logmodel, iris.logd)
pred

## -----------------------------------------------------------------------------
invert(inv, pred)

## -----------------------------------------------------------------------------
newdata = makeRegrTask(
  "newiris",
  iris[7:9, ],
  target = "Petal.Width",
  fixup.data = "no",
  check.data = FALSE
)

## -----------------------------------------------------------------------------
# the retrafo does the same transformation(s) on newdata that were
# done on the training data of the model, iris.logd. In general, this
# could be more than just the target log transformation.
newdata.transformed = newdata %>>% retrafo(iris.logd)
getTaskData(newdata.transformed)

## -----------------------------------------------------------------------------
pred = predict(logmodel, newdata.transformed)
pred

## -----------------------------------------------------------------------------
# the inverter of the newly transformed data contains information specific
# to the newly transformed data. In the current case, that is just the
# new "truth" column for the new data.
inv.newdata = inverter(newdata.transformed)
invert(inv.newdata, pred)

## -----------------------------------------------------------------------------
invert(retrafo(iris.logd), pred)

## -----------------------------------------------------------------------------
# can do both retrafo and inversion
getCPOTrainedCapability(retrafo(iris.logd))

## -----------------------------------------------------------------------------
# a pure inverter, can not be used for retrafo
getCPOTrainedCapability(inv)

## ----warnings = FALSE---------------------------------------------------------
# for reproducibility
set.seed(123)
iris.resid = iris.regr %>>% cpoRegrResiduals("regr.lm")
getTaskData(iris.resid)

## -----------------------------------------------------------------------------
model.resid = train("regr.randomForest", iris.resid)
newdata.resid = newdata %>>% retrafo(iris.resid)
# Petal.Width are now the residuals of lm model predictions
getTaskData(newdata.resid)

## -----------------------------------------------------------------------------
pred = predict(model.resid, newdata.resid)
pred

## -----------------------------------------------------------------------------
# transforming this prediction back to compare
# it to the original 'Petal.Width'
inv.newdata = inverter(newdata.resid)
invert(inv.newdata, pred)

## -----------------------------------------------------------------------------
sampled = iris %>>% cpoSample(size = 3)
sampled

## -----------------------------------------------------------------------------
retrafo(sampled)
inverter(sampled)

## -----------------------------------------------------------------------------
# for reproducibility
set.seed(123)
lrn = cpoRegrResiduals("regr.lm") %>>% makeLearner("regr.randomForest")
lrn

## ----warnings = FALSE---------------------------------------------------------
model = train(lrn, iris.regr)
pred = predict(model, newdata)
pred

## -----------------------------------------------------------------------------
retrafo(model)

## -----------------------------------------------------------------------------
icalrn = cpoIca() %>>% makeLearner("classif.logreg")
getParamSet(icalrn)

## -----------------------------------------------------------------------------
ps = makeParamSet(
  makeIntegerParam("ica.n.comp", lower = 1, upper = 8),
  makeDiscreteParam("ica.alg.typ", values = c("parallel", "deflation"))
)
# shorter version using pSS:
# ps = pSS(ica.n.comp: integer[1, 8], ica.alg.typ: discrete[parallel, deflation])

## -----------------------------------------------------------------------------
tuneParams(
  icalrn,
  pid.task,
  cv5,
  par.set = ps,
  control = makeTuneControlGrid(),
  show.info = FALSE
)

## -----------------------------------------------------------------------------
# plain print
cpoAsNumeric
# verbose print
!cpoAsNumeric

## -----------------------------------------------------------------------------
# plain print
cpoScale() %>>% cpoIca()
# verbose print
!cpoScale() %>>% cpoIca()

## -----------------------------------------------------------------------------
as.list(cpoScale() %>>% cpoIca())

## -----------------------------------------------------------------------------
pipeCPO(list(cpoScale(), cpoIca()))

## -----------------------------------------------------------------------------
repca = retrafo(iris.demo %>>% cpoPca())
state = getCPOTrainedState(repca)
state

## -----------------------------------------------------------------------------
# switch off centering and scaling in the extracted state, then rebuild
state$control$center = FALSE
state$control$scale = FALSE
nosc.repca = makeCPOTrainedFromState(cpoPca, state)

## -----------------------------------------------------------------------------
iris.demo %>>% repca

## -----------------------------------------------------------------------------
iris.demo %>>% nosc.repca

## -----------------------------------------------------------------------------
NULLCPO

## -----------------------------------------------------------------------------
all.equal(iris %>>% NULLCPO, iris)
cpoPca() %>>% NULLCPO

## -----------------------------------------------------------------------------
cpm = cpoMultiplex(list(cpoIca, cpoPca(export = "export.all")))
!cpm

## -----------------------------------------------------------------------------
iris.demo %>>% setHyperPars(cpm, selected.cpo = "ica", ica.n.comp = 3)

## -----------------------------------------------------------------------------
iris.demo %>>% setHyperPars(cpm, selected.cpo = "pca", pca.rank = 3)

## -----------------------------------------------------------------------------
cpa = cpoWrap()
!cpa

## -----------------------------------------------------------------------------
iris.demo %>>% setHyperPars(cpa, wrap.cpo = cpoScale())

## -----------------------------------------------------------------------------
iris.demo %>>% setHyperPars(cpa, wrap.cpo = cpoPca())

## -----------------------------------------------------------------------------
getParamSet(cpoWrap() %>>% makeLearner("classif.logreg"))

## -----------------------------------------------------------------------------
# three column groups to be bound together: scaled "Sepal" columns, their
# PCA, and the "Petal" columns
scale = cpoSelect(pattern = "Sepal", id = "first") %>>% cpoScale(id = "scale")
scale.pca = scale %>>% cpoPca()
cbinder = cpoCbind(
  scale,
  scale.pca,
  cpoSelect(pattern = "Petal", id = "second")
)

## -----------------------------------------------------------------------------
!cbinder

## -----------------------------------------------------------------------------
iris.demo %>>% cbinder