## ----setup, include=FALSE----------------------------------------------------- knitr::opts_chunk$set(echo = TRUE, eval=FALSE) options(rmarkdown.html_vignette.check_title = FALSE) data.table::setDTthreads(2) ## ----------------------------------------------------------------------------- # library(RaMS) # library(tidyverse) # library(microbenchmark) # library(MSnbase) # library(Spectra) # library(DBI) # library(arrow) # library(rvest) # library(xml2) # # BiocParallel::register(BiocParallel::SerialParam(stop.on.error = FALSE, progressbar = TRUE)) ## ----file download------------------------------------------------------------ # set.seed(123) # # n_ms_files <- 10 # base_url <- "ftp://massive.ucsd.edu/v01/MSV000080030/peak/" %>% # paste0("Forensic_study_80_volunteers/Forensic_Hands_mzXML/") %>% # paste0("Forensic_Hands_plate_1_mzXML/Sample/") # file_list <- base_url %>% # read_html %>% # xml_text() %>% # strsplit("\\\n") %>% # pluck(1) %>% # str_remove(".*2016 ") %>% # str_remove("\\\r") # chosen_file_list <- sample(file_list, n_ms_files) # # dir.create("vignettes/figures/ssc_vignette_renders/Sample/") # for(i in chosen_file_list){ # new_file_path <- paste0("vignettes/figures/ssc_vignette_renders/Sample/", i) # download.file(paste0(base_url, i), destfile = new_file_path, mode = "wb") # } # # ms_files <- list.files("vignettes/figures/ssc_vignette_renders/Sample/", full.names = TRUE) ## ----MSnExp------------------------------------------------------------------- # msnexp_obj <- readMSData(ms_files, mode="inMemory", msLevel. = 1) # plot(chromatogram(msnexp_obj, mz=pmppm(432.2810, ppm = 20))) ## ----onDiskMSnExp------------------------------------------------------------- # ondisk_obj <- readMSData(ms_files, mode="onDisk", msLevel. = 1) # plot(chromatogram(ondisk_obj, mz=pmppm(432.2810, ppm = 20))) ## ----Spectra------------------------------------------------------------------ # getIntensities <- function(x, ...) { # if (nrow(x)) { # cbind(mz = NA_real_, intensity = x[, "intensity"]) # } else cbind(mz = NA_real_, intensity = NA_real_) # } # # sfs_filtered <- Spectra(ms_files, source=MsBackendMzR()) %>% # filterMsLevel(1) %>% # filterMzRange(pmppm(432.2810, ppm = 20)) # sfs_agg <- addProcessing(sfs_filtered, getIntensities) # eic <- cbind(rt=rtime(sfs_agg), int=unlist(intensity(sfs_agg), use.names = FALSE)) # plot(eic[,"rt"], eic[,"int"], type="l") ## ----RaMS--------------------------------------------------------------------- # rams_obj <- grabMSdata(ms_files, grab_what="MS1") # rams_chrom_data <- rams_obj$MS1[mz%between%pmppm(432.2810, ppm = 20)] # plot(rams_chrom_data$rt, rams_chrom_data$int, type="l") ## ----tmzML-------------------------------------------------------------------- # tmzml_names <- paste0(dirname(dirname(ms_files)), "/tmzMLs/", gsub("mzXML", "tmzML", basename(ms_files))) # dir.create("vignettes/figures/ssc_vignette_renders/tmzMLs") # bpmapply(tmzmlMaker, ms_files, tmzml_names, BPPARAM = SnowParam(workers = 3, progressbar = TRUE, tasks=length(tmzml_names))) # tmzml_obj <- grabMSdata(tmzml_names) # tmzml_chrom_data <- tmzml_obj$MS1[mz%between%pmppm(432.2810, ppm = 20)] # plot(tmzml_chrom_data$rt, tmzml_chrom_data$int, type="l") ## ----arrow-------------------------------------------------------------------- # write_dataset(rams_obj$MS1[order(mz)], path = "vignettes/figures/ssc_vignette_renders/pqds") # arrow_data <- open_dataset("vignettes/figures/ssc_vignette_renders/pqds") %>% # filter(mz%between%pmppm(432.2810, ppm = 20)) %>% # dplyr::collect() # plot(arrow_data$rt, arrow_data$int, type="l") ## ----sqlite database---------------------------------------------------------- # MSdb <- dbConnect(RSQLite::SQLite(), "vignettes/figures/ssc_vignette_renders/MSdata.sqlite") # dbWriteTable(MSdb, "MS1", rams_obj$MS1, overwrite=TRUE) # EIC_query <- 'SELECT * FROM MS1 WHERE mz BETWEEN :lower_bound AND :upper_bound' # query_params <- as.list(pmppm(432.2810, ppm = 20)) # names(query_params) <- c("lower_bound", "upper_bound") # sql_data <- dbGetQuery(MSdb, EIC_query, params = query_params) # plot(sql_data$rt, sql_data$int, type="l") # # dbClearResult(dbSendQuery(MSdb, "CREATE INDEX mz ON MS1 (mz)")) # sql_data <- dbGetQuery(MSdb, EIC_query, params = query_params) # sql_data <- sql_data[order(sql_data$filename, sql_data$rt),] # plot(sql_data$rt, sql_data$int, type="l") # dbDisconnect(MSdb) ## ----------------------------------------------------------------------------- # msduck <- dbConnect(duckdb::duckdb(), "vignettes/figures/ssc_vignette_renders/duckdb") # dbWriteTable(msduck, "MS1", rams_obj$MS1[order(mz)], overwrite=TRUE) # # EIC_query <- paste('SELECT * FROM MS1 WHERE mz BETWEEN', pmppm(432.2810, ppm = 20)[1], # 'AND', pmppm(432.2810, ppm = 20)[2]) # duck_data <- dbGetQuery(msduck, EIC_query) # duck_data <- duck_data[order(duck_data$filename, duck_data$rt),] # plot(duck_data$rt, duck_data$int, type="l") # dbDisconnect(msduck) ## ----time2make---------------------------------------------------------------- # msnexp_make_fun <- function(){ # readMSData(ms_files, mode="inMemory", msLevel. = 1) # } # ondisk_make_fun <- function(){ # readMSData(ms_files, mode="onDisk", msLevel. = 1) # } # spectra_make_fun <- function(){ # Spectra(ms_files, source=MsBackendMzR()) %>% filterMsLevel(1) # } # rams_make_fun <- function(){ # grabMSdata(ms_files, grab_what="MS1") # } # tmzml_make_fun <- function(){ # tmzml_names <- paste0(dirname(dirname(ms_files)), "/tmzMLs/", # gsub("mzXML", "tmzML", basename(ms_files))) # dir.create("vignettes/figures/ssc_vignette_renders/tmzMLs") # mapply(tmzmlMaker, ms_files, tmzml_names) # unlink("vignettes/figures/ssc_vignette_renders/tmzMLs", recursive = TRUE) # } # arrow_make_fun <- function(){ # msdata <- grabMSdata(ms_files, grab_what="MS1") # write_dataset(msdata$MS1, path = "vignettes/figures/ssc_vignette_renders/pqds") # unlink("vignettes/figures/ssc_vignette_renders/pqds", recursive = TRUE) # } # sql_make_fun <- function(){ # msdata <- grabMSdata(ms_files, grab_what="MS1") # MSdb <- dbConnect(RSQLite::SQLite(), "vignettes/figures/ssc_vignette_renders/MSdata.sqlite") # dbWriteTable(MSdb, "MS1", msdata$MS1, overwrite=TRUE) # dbDisconnect(MSdb) # unlink("vignettes/figures/ssc_vignette_renders/MSdata.sqlite") # } # sqlidx_make_fun <- function(){ # msdata <- grabMSdata(ms_files, grab_what="MS1") # MSdb_idx <- dbConnect(RSQLite::SQLite(), "vignettes/figures/ssc_vignette_renders/MSdata_idx.sqlite") # dbWriteTable(MSdb_idx, "MS1", msdata$MS1, overwrite=TRUE) # rs <- dbSendQuery(MSdb_idx, "CREATE INDEX mz ON MS1 (mz)") # dbClearResult(rs) # dbDisconnect(MSdb_idx) # unlink("vignettes/figures/ssc_vignette_renders/MSdata_idx.sqlite") # } # duckdb_make_fun <- function(){ # msdata <- grabMSdata(ms_files, grab_what="MS1") # msduck <- dbConnect(duckdb::duckdb(), "vignettes/figures/ssc_vignette_renders/duckdb") # dbWriteTable(msduck, "MS1", rams_obj$MS1[order(mz)], overwrite=TRUE) # dbDisconnect(msduck) # unlink("vignettes/figures/ssc_vignette_renders/duckdb") # } # # make_timings <- microbenchmark( # msnexp_make_fun(), ondisk_make_fun(), spectra_make_fun(), rams_make_fun(), # tmzml_make_fun(), arrow_make_fun(), sql_make_fun(), sqlidx_make_fun(), # duckdb_make_fun(), # times = 10) # write_csv(make_timings, "vignettes/figures/ssc_vignette_renders/make_timings.csv") ## ----plot time2make----------------------------------------------------------- # make_timings <- read_csv("vignettes/figures/ssc_vignette_renders/make_timings.csv") # make_timings %>% # as.data.frame() %>% # arrange(expr) %>% # mutate(expr=str_remove(expr, "_make_fun\\(\\)")) %>% # mutate(rep_type=case_when( # expr%in%c("msnexp", "ondisk", "spectra", "rams")~"Every R session", # TRUE~"Single-time only" # )) %>% # mutate(expr=factor(expr, levels=c("msnexp", "ondisk", "spectra", "rams", # "tmzml", "arrow", "sql", "sqlidx", "duckdb"), # labels=c("MSnExp", "OnDiskMSnExp", "Spectra", "RaMS", # "tmzMLs", "Arrow", "SQL", "SQL (indexed)", "DuckDB"))) %>% # ggplot() + # geom_boxplot(aes(x=expr, y=time/1e9)) + # geom_hline(yintercept = 0) + # facet_wrap(~rep_type, nrow = 1, scales="free_x") + # labs(y="Seconds", x=NULL) + # theme(axis.text.x = element_text(angle=90, hjust=1, vjust=0.5)) # ggsave("vignettes/figures/ssc_vignette_renders/make_time_gp.png", width = 6.5, height = 4, units = "in", device = "png", dpi = 144) ## ----parallel_proc------------------------------------------------------------ # unpar_rams <- function(){ # print("Unpar RaMS") # lapply(ms_files, grabMSdata) # } # unpar_tmzml <- function(){ # print("Unpar tmzML") # tmzml_names <- paste0(dirname(dirname(ms_files)), "/tmzMLs/", # gsub("mzXML", "tmzML", basename(ms_files))) # dir.create("vignettes/figures/ssc_vignette_renders/tmzMLs") # pbapply::pbmapply(tmzmlMaker, ms_files, tmzml_names) # unlink("vignettes/figures/ssc_vignette_renders/tmzMLs", recursive = TRUE) # } # # library(BiocParallel) # par_param <- SnowParam(workers = 5, progressbar = TRUE) # par_rams <- function(){ # print("Par RaMS") # bplapply(ms_files, grabMSdata, BPPARAM = par_param) # } # par_tmzml <- function(){ # print("Par tmzML") # tmzml_names <- paste0(dirname(dirname(ms_files)), "/tmzMLs/", # gsub("mzXML", "tmzML", basename(ms_files))) # dir.create("vignettes/figures/ssc_vignette_renders/tmzMLs") # bpmapply(tmzmlMaker, ms_files, tmzml_names, BPPARAM = par_param) # unlink("vignettes/figures/ssc_vignette_renders/tmzMLs", recursive = TRUE) # } # # par_timings <- microbenchmark(par_rams(), unpar_rams(), par_tmzml(), unpar_tmzml(), times = 5) # write_csv(par_timings, "vignettes/figures/ssc_vignette_renders/par_timings.csv") ## ----parallel proc plot------------------------------------------------------- # par_timings <- read_csv("vignettes/figures/ssc_vignette_renders/par_timings.csv") # par_timings %>% # as.data.frame() %>% # separate(expr, into = c("sub_type", "par_type"), sep = "_") %>% # mutate(par_type=str_remove(par_type, "\\(\\)")) %>% # mutate(sub_type=factor(sub_type, levels=c("unpar", "par"), # labels=c("Sequential", "Parallel"))) %>% # mutate(par_type=factor(par_type, levels=c("rams", "tmzml"), # labels=c("RaMS", "tmzMLs"))) %>% # ggplot() + # geom_boxplot(aes(x=par_type, y=time/1e9)) + # geom_hline(yintercept = 0) + # facet_wrap(~sub_type, nrow = 1) + # labs(y="Seconds", x=NULL) + # theme(axis.text.x = element_text(angle=90, hjust=1, vjust=0.5)) # ggsave("vignettes/figures/ssc_vignette_renders/par_time_gp.png", width = 6.5, height = 3, units = "in", device = "png", dpi = 144) ## ----time2query--------------------------------------------------------------- # msnexp_obj <- readMSData(ms_files, mode="inMemory", msLevel. = 1) # ondisk_obj <- readMSData(ms_files, mode="onDisk", msLevel. = 1) # spectra_obj <- Spectra(ms_files, source=MsBackendMzR()) %>% filterMsLevel(1) # rams_obj <- grabMSdata(ms_files, grab_what="MS1") # # tmzml_names <- paste0(dirname(dirname(ms_files)), "/tmzMLs/", # gsub("mzXML", "tmzML", basename(ms_files))) # dir.create("vignettes/figures/ssc_vignette_renders/tmzMLs") # mapply(tmzmlMaker, ms_files, tmzml_names) # # write_dataset(rams_obj$MS1[order(mz)], path = "vignettes/figures/ssc_vignette_renders/pqds") # # MSdb <- dbConnect(RSQLite::SQLite(), "vignettes/figures/ssc_vignette_renders/MSdata.sqlite") # dbWriteTable(MSdb, "MS1", rams_obj$MS1, overwrite=TRUE) # dbDisconnect(MSdb) # # MSdb_idx <- dbConnect(RSQLite::SQLite(), "vignettes/figures/ssc_vignette_renders/MSdata_idx.sqlite") # dbWriteTable(MSdb_idx, "MS1", rams_obj$MS1, overwrite=TRUE) # dbClearResult(dbSendQuery(MSdb_idx, "CREATE INDEX mz ON MS1 (mz)")) # dbDisconnect(MSdb_idx) # # msduck <- dbConnect(duckdb::duckdb(), "vignettes/figures/ssc_vignette_renders/duckdb") # dbWriteTable(msduck, "MS1", rams_obj$MS1[order(mz)], overwrite=TRUE) # dbDisconnect(msduck) # # msnexp_query_fun <- function(){ # plot(chromatogram(msnexp_obj, mz=pmppm(432.2810, ppm = 20))) # } # ondisk_query_fun <- function(){ # plot(chromatogram(ondisk_obj, mz=pmppm(432.2810, ppm = 20))) # } # spectra_query_fun <- function(){ # sfs_filtered <- spectra_obj %>% filterMzRange(pmppm(432.2810, ppm = 20)) # getIntensities <- function(x, ...) { # if (nrow(x)) { # cbind(mz = NA_real_, intensity = x[, "intensity"]) # } else cbind(mz = NA_real_, intensity = NA_real_) # } # sfs_agg <- addProcessing(sfs_filtered, getIntensities) # eic <- cbind(rt=rtime(sfs_agg), int=unlist(intensity(sfs_agg), use.names = FALSE)) # plot(eic[,"rt"], eic[,"int"], type="l") # } # rams_query_fun <- function(){ # rams_chrom_data <- rams_obj$MS1[mz%between%pmppm(432.2810, ppm = 20)] # plot(rams_chrom_data$rt, rams_chrom_data$int, type="l") # } # tmzml_query_fun <- function(){ # tmzml_names <- list.files("vignettes/figures/ssc_vignette_renders/tmzMLs", full.names = TRUE) # tmzml_obj <- grabMSdata(tmzml_names) # tmzml_chrom_data <- tmzml_obj$MS1[mz%between%pmppm(432.2810, ppm = 20)] # plot(tmzml_chrom_data$rt, tmzml_chrom_data$int, type="l") # } # arrow_query_fun <- function(){ # arrow_data <- open_dataset("vignettes/figures/ssc_vignette_renders/pqds") %>% # filter(mz%between%pmppm(432.2810, ppm = 20)) %>% # dplyr::collect() # arrow_data <- arrow_data[order(arrow_data$filename, arrow_data$rt),] # plot(arrow_data$rt, arrow_data$int, type="l") # } # sql_query_fun <- function(){ # MSdb <- dbConnect(RSQLite::SQLite(), "vignettes/figures/ssc_vignette_renders/MSdata.sqlite") # EIC_query <- 'SELECT * FROM MS1 WHERE mz BETWEEN :lower_bound AND :upper_bound' # query_params <- as.list(pmppm(432.2810, ppm = 20)) # names(query_params) <- c("lower_bound", "upper_bound") # sql_data <- dbGetQuery(MSdb, EIC_query, params = query_params) # plot(sql_data$rt, sql_data$int, type="l") # dbDisconnect(MSdb) # } # sqlidx_query_fun <- function(){ # MSdb_idx <- dbConnect(RSQLite::SQLite(), "vignettes/figures/ssc_vignette_renders/MSdata_idx.sqlite") # EIC_query <- 'SELECT * FROM MS1 WHERE mz BETWEEN :lower_bound AND :upper_bound' # query_params <- as.list(pmppm(432.2810, ppm = 20)) # names(query_params) <- c("lower_bound", "upper_bound") # sql_data <- dbGetQuery(MSdb_idx, EIC_query, params = query_params) # sql_data <- sql_data[order(sql_data$filename, sql_data$rt),] # plot(sql_data$rt, sql_data$int, type="l") # dbDisconnect(MSdb_idx) # } # duckdb_query_fun <- function(){ # msduck <- dbConnect(duckdb::duckdb(), "vignettes/figures/ssc_vignette_renders/duckdb") # EIC_query <- paste('SELECT * FROM MS1 WHERE mz BETWEEN', pmppm(432.2810, ppm = 20)[1], # 'AND', pmppm(432.2810, ppm = 20)[2]) # duck_data <- dbGetQuery(msduck, EIC_query) # duck_data <- duck_data[order(duck_data$filename, duck_data$rt),] # plot(duck_data$rt, duck_data$int, type="l") # dbDisconnect(msduck) # } # # Could be optimized by doing the ordering within the query # # query_timings <- microbenchmark( # msnexp_query_fun(), ondisk_query_fun(), spectra_query_fun(), rams_query_fun(), # tmzml_query_fun(), arrow_query_fun(), sql_query_fun(), sqlidx_query_fun(), # duckdb_query_fun(), # times = 10 # ) # query_timings # write_csv(query_timings, "vignettes/figures/ssc_vignette_renders/query_timings.csv") ## ----time2query plot---------------------------------------------------------- # query_timings <- read_csv("vignettes/figures/ssc_vignette_renders/query_timings.csv") # query_timings %>% # as.data.frame() %>% # arrange(expr) %>% # mutate(expr=str_remove(expr, "_query_fun\\(\\)")) %>% # mutate(rep_type=case_when( # expr%in%c("msnexp", "ondisk", "spectra", "rams")~"Every R session", # TRUE~"Single-time only" # )) %>% # mutate(expr=factor(expr, levels=c("msnexp", "ondisk", "spectra", "rams", # "tmzml", "arrow", "sql", "sqlidx", "duckdb"), # labels=c("MSnExp", "OnDiskMSnExp", "Spectra", "RaMS", # "tmzMLs", "Arrow", "SQL", "SQL (indexed)", "DuckDB"))) %>% # ggplot() + # geom_boxplot(aes(x=expr, y=time/1e9)) + # scale_y_log10() + # labs(y="Seconds", x=NULL) + # theme(axis.text.x = element_text(angle=90, hjust=1, vjust=0.5)) # ggsave("vignettes/figures/ssc_vignette_renders/query_time_gp.png", width = 6.5, height = 5, units = "in", device = "png", dpi = 144) ## ----double-check sql_idx vs RaMS--------------------------------------------- # MSdb_idx <- dbConnect(RSQLite::SQLite(), "vignettes/figures/ssc_vignette_renders/MSdata_idx.sqlite") # EIC_query <- 'SELECT * FROM MS1 WHERE mz BETWEEN :lower_bound AND :upper_bound' # query_params <- as.list(pmppm(432.2810, ppm = 20)) # names(query_params) <- c("lower_bound", "upper_bound") # dbGetQuery(MSdb_idx, EIC_query, params = query_params) %>% qplotMS1data() # # rams_obj$MS1[mz%between%pmppm(432.2810, ppm = 20)] %>% qplotMS1data() ## ----make query strings------------------------------------------------------- # rams_obj <- grabMSdata(ms_files, grab_what="MS1") # grouped_ms1 <- rams_obj$MS1 %>% # arrange(desc(int)) %>% # mutate(mz_group=mz_group(mz, ppm = 10, max_groups = 10, min_group_size=20)) %>% # drop_na() # # grouped_ms1 %>% # # qplotMS1data(facet_col="mz_group", facet_args = list(ncol=2)) # mzs_to_grab <- grouped_ms1 %>% # group_by(mz_group) %>% # summarise(mean_mz=mean(mz), sd_mz=sd(mz), mean_rt=mean(rt)) %>% # pull(mean_mz) # # rams_arrow_call <- lapply(mzs_to_grab, function(mz_i){ # mzrange <- pmppm(mz_i, 10) # call("between", as.name("mz"), mzrange[[1]], mzrange[[2]]) # }) %>% paste(collapse = "|") # # sql_comb_call <- sapply(mzs_to_grab, function(mz_i){ # paste("mz BETWEEN", pmppm(mz_i, 10)[1], "AND", pmppm(mz_i, 10)[2]) # }) %>% paste(collapse = " OR ") %>% paste("SELECT * FROM MS1 WHERE", .) # # print(rams_arrow_call) # print(sql_comb_call) ## ----multichrom query timing-------------------------------------------------- # rams_uni_fun <- function(){ # print("RaMS unified") # rams_obj$MS1[eval(parse(text=rams_arrow_call))] # } # rams_loop_fun <- function(){ # print("RaMS loop") # lapply(mzs_to_grab, function(mz_i){ # rams_obj$MS1[mz%between%pmppm(mz_i, 10)] # }) %>% bind_rows() %>% distinct() # } # # arrow_ds <- open_dataset("vignettes/figures/ssc_vignette_renders/pqds") # arrow_uni_fun <- function(){ # print("Arrow unified") # arrow_ds %>% # filter(eval(parse(text = rams_arrow_call))) %>% # collect() # } # arrow_loop_fun <- function(){ # print("Arrow loop") # lapply(mzs_to_grab, function(mz_i){ # arrow_ds %>% # filter(mz%between%pmppm(mz_i, 10)) %>% # collect() # }) %>% bind_rows() %>% distinct() # } # # MSdb_idx <- dbConnect(RSQLite::SQLite(), "vignettes/figures/ssc_vignette_renders/MSdata_idx.sqlite") # sql_uni_fun <- function(){ # print("SQL unified") # dbGetQuery(MSdb_idx, sql_comb_call) # } # sql_query_base <- 'SELECT * FROM MS1 WHERE mz BETWEEN :lower_bound AND :upper_bound' # sql_loop_fun <- function(){ # print("SQL loop") # lapply(mzs_to_grab, function(mz_i){ # query_params <- as.list(pmppm(mz_i, ppm = 20)) # names(query_params) <- c("lower_bound", "upper_bound") # sql_data <- dbGetQuery(MSdb_idx, sql_query_base, params = query_params) # }) %>% bind_rows() %>% distinct() # } # # msduck <- dbConnect(duckdb::duckdb(), "vignettes/figures/ssc_vignette_renders/duckdb") # duckdb_uni_fun <- function(){ # print("DuckDB unified") # dbGetQuery(msduck, sql_comb_call) # } # duckdb_loop_fun <- function(){ # print("DuckDB loop") # lapply(mzs_to_grab, function(mz_i){ # EIC_query <- paste('SELECT * FROM MS1 WHERE mz BETWEEN', pmppm(mz_i, ppm = 20)[1], # 'AND', pmppm(mz_i, ppm = 20)[2]) # sql_data <- dbGetQuery(msduck, EIC_query) # }) %>% bind_rows() %>% distinct() # } # # # multichrom_timings <- microbenchmark( # rams_uni_fun(), rams_loop_fun(), arrow_uni_fun(), arrow_loop_fun(), # sql_uni_fun(), sql_loop_fun(), duckdb_uni_fun(), duckdb_loop_fun(), # times = 10 # ) # write_csv(multichrom_timings, "vignettes/figures/ssc_vignette_renders/multichrom_timings.csv") ## ----multichrom query plot---------------------------------------------------- # multichrom_timings <- read_csv("vignettes/figures/ssc_vignette_renders/multichrom_timings.csv") # multichrom_timings %>% # as.data.frame() %>% # arrange(expr) %>% # mutate(expr=str_remove(expr, "_fun\\(\\)")) %>% # separate(expr, into = c("expr", "query_type"), sep = "_") %>% # mutate(expr=factor(expr, levels=c("rams", "arrow", "sql", "duckdb"), # labels=c("RaMS", "Arrow", "SQL", "DuckDB"))) %>% # mutate(query_type=factor(query_type, levels=c("uni", "loop"), # labels=c("Unified query", "Loop"))) %>% # ggplot() + # geom_boxplot(aes(x=query_type, y=time/1e9), lwd=1) + # facet_wrap(~expr, nrow=1) + # scale_y_log10() + # labs(y="Seconds", x=NULL, color=NULL) + # theme_bw() # ggsave("vignettes/figures/ssc_vignette_renders/multichrom_gp.png", width = 6.5, height = 4, units = "in", device = "png", dpi = 144) ## ----sizing info-------------------------------------------------------------- # size_list <- list() # # size_list$mzXML <- sum(file.size(ms_files)) # # msnexp_obj <- readMSData(ms_files, mode="inMemory", msLevel. = 1) # size_list$msnexp_obj <- pryr::object_size(msnexp_obj) # rm(msnexp_obj) # # ondisk_obj <- readMSData(ms_files, mode="onDisk", msLevel. = 1) # size_list$ondisk_obj <- pryr::object_size(ondisk_obj) # rm(ondisk_obj) # # sfs_filtered <- Spectra(ms_files, source=MsBackendMzR()) %>% # filterMsLevel(1) # size_list$spectra <- pryr::object_size(sfs_filtered) # rm(sfs_filtered) # # rams_obj <- grabMSdata(ms_files, grab_what="MS1") # size_list$rams <- pryr::object_size(rams_obj) # # tmzml_names <- paste0(dirname(dirname(ms_files)), "/tmzMLs/", gsub("mzXML", "tmzML", basename(ms_files))) # dir.create("vignettes/figures/ssc_vignette_renders/tmzMLs") # bpmapply(tmzmlMaker, ms_files, tmzml_names, BPPARAM = SnowParam(workers = 5, progressbar = TRUE, tasks=length(tmzml_names))) # size_list$tmzml <- sum(file.size(list.files("vignettes/figures/ssc_vignette_renders/tmzMLs", full.names = TRUE))) # unlink("vignettes/figures/ssc_vignette_renders/tmzMLs", recursive = TRUE) # # write_dataset(rams_obj$MS1, path = "vignettes/figures/ssc_vignette_renders/pqds") # size_list$arrow <- sum(file.size(list.files("vignettes/figures/ssc_vignette_renders/pqds", full.names = TRUE))) # unlink("vignettes/figures/ssc_vignette_renders/pqds", recursive = TRUE) # # MSdb <- dbConnect(RSQLite::SQLite(), "vignettes/figures/ssc_vignette_renders/MSdata.sqlite") # dbWriteTable(MSdb, "MS1", rams_obj$MS1, overwrite=TRUE) # dbDisconnect(MSdb) # size_list$MSdb <- file.size("vignettes/figures/ssc_vignette_renders/MSdata.sqlite") # # # MSdb_idx <- dbConnect(RSQLite::SQLite(), "vignettes/figures/ssc_vignette_renders/MSdata.sqlite") # rs <- dbSendQuery(MSdb_idx, "CREATE INDEX mz ON MS1 (mz)") # dbClearResult(rs) # dbDisconnect(MSdb_idx) # size_list$MSdb_idx <- file.size("vignettes/figures/ssc_vignette_renders/MSdata.sqlite") # unlink("vignettes/figures/ssc_vignette_renders/MSdata.sqlite") # # msduck <- dbConnect(duckdb::duckdb(), "vignettes/figures/ssc_vignette_renders/duckdb") # dbWriteTable(msduck, "MS1", rams_obj$MS1[order(mz)], overwrite=TRUE) # size_list$msduck <- file.size("vignettes/figures/ssc_vignette_renders/duckdb") # dbDisconnect(msduck) # unlink("vignettes/figures/ssc_vignette_renders/duckdb") # # size_list %>% # unlist() %>% # data.frame(bytes=.) %>% # rownames_to_column("expr") %>% # write_csv("vignettes/figures/ssc_vignette_renders/size_list.csv") ## ----plot size info----------------------------------------------------------- # size_df <- read_csv("vignettes/figures/ssc_vignette_renders/size_list.csv") # size_df %>% # filter(expr!="mzXML") %>% # mutate(expr=str_remove(expr, "_obj")) %>% # mutate(expr=str_replace(expr, "MSdb_?", "sql")) %>% # mutate(mem_type=case_when( # expr%in%c("msnexp", "ondisk", "spectra", "rams")~"Memory", # TRUE~"Disk" # )) %>% # mutate(mem_type=factor(mem_type, levels=c("Memory", "Disk"))) %>% # mutate(expr=factor(expr, levels=c("msnexp", "ondisk", "spectra", "rams", # "tmzml", "arrow", "sql", "sqlidx", "duckdb"), # labels=c("MSnExp", "OnDisk\nMSnExp", "Spectra", "RaMS", # "tmzMLs", "Arrow", "SQL", "SQL\n(indexed)", "DuckDB"))) %>% # ggplot() + # geom_hline(yintercept = size_df$bytes[size_df$expr=="mzXML"]/(1024^3)) + # geom_point(aes(x=expr, y=bytes/(1024^3))) + # scale_y_log10(breaks=c(0.001, 0.01, 0.1, 1, 10), labels=c("1MB", "10MB", "100MB", "1GB", "10GB"), # limits=c(0.001, 10)) + # facet_grid(~mem_type, scales = "free_x", space = "free_x") + # labs(x=NULL, y=NULL) # ggsave("vignettes/figures/ssc_vignette_renders/size_cons.png", width = 6.5, height = 4, units = "in", device = "png", dpi = 144) ## ----summary plot------------------------------------------------------------- # make_timings <- read_csv("vignettes/figures/ssc_vignette_renders/make_timings.csv") # query_timings <- read_csv("vignettes/figures/ssc_vignette_renders/query_timings.csv") # size_df <- read_csv("vignettes/figures/ssc_vignette_renders/size_list.csv") %>% # mutate(expr=str_remove(expr, "_obj")) %>% # mutate(expr=str_replace(expr, "MSdb_?", "sql")) %>% # mutate(expr=ifelse(expr=="msduck", "duckdb", expr)) # # bind_rows(make_timings, query_timings) %>% # as.data.frame() %>% # group_by(expr) %>% # mutate(time=time/1e9) %>% # summarise(med_time=median(time), IQR_time=IQR(time)) %>% # mutate(expr=str_remove(expr, "_fun\\(\\)")) %>% # separate(expr, into = c("expr", "time_type")) %>% # left_join(size_df) %>% # mutate(rep_type=case_when( # expr%in%c("msnexp", "ondisk", "spectra", "rams")~"Memory", # TRUE~"Disk" # )) %>% # mutate(expr=factor(expr, levels=c("msnexp", "ondisk", "spectra", "rams", # "tmzml", "arrow", "sql", "sqlidx", "duckdb"), # labels=c("MSnExp", "OnDiskMSnExp", "Spectra", "RaMS", # "tmzMLs", "Arrow", "SQL", "SQL (indexed)", "DuckDB"))) %>% # pivot_wider(names_from = time_type, values_from=c("med_time", "IQR_time")) %>% # ggplot() + # geom_vline(xintercept = 0) + # geom_linerange(aes(x=med_time_make, ymin=med_time_query-IQR_time_query*2, # ymax=med_time_query+IQR_time_query*2, color=expr)) + # geom_linerange(aes(y=med_time_query, xmin=med_time_make-IQR_time_make*2, # xmax=med_time_make+IQR_time_make*2, color=expr)) + # geom_point(aes(x=med_time_make, y=med_time_query, color=expr, # size=bytes, shape=rep_type)) + # scale_y_log10() + # coord_flip() + # guides(color = guide_legend(order = 1), shape = guide_legend(order = 2), # size=guide_legend(order=3)) + # labs(x="Time to transform (s)", y="Time to query (s)", color=NULL, size="Size (GB)", # shape="Storage") + # theme_bw() # ggsave("vignettes/figures/ssc_vignette_renders/sum_plot.png", width = 6.5, height = 5.5, units = "in", device = "png", dpi = 144) # # # # bind_rows(make_timings, query_timings) %>% # # as.data.frame() %>% # # group_by(expr) %>% # # mutate(time=time/1e9) %>% # # summarise(med_time=median(time)) %>% # # mutate(expr=str_remove(expr, "_fun\\(\\)")) %>% # # separate(expr, into = c("expr", "time_type")) %>% # # pivot_wider(names_from = time_type, values_from=med_time) %>% # # left_join(size_df) %>% # # plotly::plot_ly(x=~make, y=~query, z=~size, hovertext=~expr, # # type="scatter3d", mode="markers") # # # bind_rows(make_timings, query_timings) %>% # as.data.frame() %>% # group_by(expr) %>% # mutate(time=time/1e9) %>% # summarise(med_time=median(time), IQR_time=IQR(time)) %>% # mutate(expr=str_remove(expr, "_fun\\(\\)")) %>% # separate(expr, into = c("expr", "time_type")) %>% # left_join(size_df, by = join_by(expr)) %>% # mutate(rep_type=case_when( # expr%in%c("msnexp", "ondisk", "spectra", "rams")~"Memory", # TRUE~"Disk" # )) %>% # mutate(expr=factor(expr, levels=c("msnexp", "ondisk", "spectra", "rams", # "tmzml", "arrow", "sql", "sqlidx", "duckdb"), # labels=c("MSnExp", "OnDiskMSnExp", "Spectra", "RaMS", # "tmzMLs", "Arrow", "SQL", "SQL (indexed)", "DuckDB"))) %>% # pivot_wider(names_from = time_type, values_from=c("med_time", "IQR_time")) %>% # ggplot() + # geom_linerange(aes(x=bytes, ymin=med_time_query-IQR_time_query, # ymax=med_time_query+IQR_time_query, color=expr), linewidth=1) + # geom_point(aes(x=bytes, y=med_time_query, color=expr), size=3) + # scale_y_log10() + # guides(color = guide_legend(order = 1)) + # labs(x="Size (GB)", y="Time to query (s)") + # theme_bw() # # # query_timings %>% # # as.data.frame() %>% # # group_by(expr) %>% # # mutate(time=time/1e9) %>% # # mutate(expr=str_remove(expr, "_query.*")) %>% # # left_join(size_df) %>% # # mutate(rep_type=case_when( # # expr%in%c("msnexp", "ondisk", "spectra", "rams")~"Memory", # # TRUE~"Disk" # # )) %>% # # mutate(expr=factor(expr, levels=c("msnexp", "ondisk", "spectra", "rams", # # "tmzml", "arrow", "sql", "sqlidx", "duckdb"), # # labels=c("MSnExp", "OnDiskMSnExp", "Spectra", "RaMS", # # "tmzMLs", "Arrow", "SQL", "SQL (indexed)", "DuckDB"))) %>% # # ggplot() + # # geom_point(aes(x=size, y=time, fill=expr), size=3, pch=21, alpha=0.7) + # # scale_y_log10() + # # guides(color = guide_legend(order = 1)) + # # labs(x="Size (GB)", y="Time to query (s)") + # # theme_bw() ## ----cleanup------------------------------------------------------------------ # # unlink("vignettes/figures/ssc_vignette_renders/tmzMLs", recursive = TRUE) # # unlink("vignettes/figures/ssc_vignette_renders/pqds", recursive = TRUE) # # unlink("vignettes/figures/ssc_vignette_renders/MSdata.sqlite") # # unlink("vignettes/figures/ssc_vignette_renders/MSdata_idx.sqlite") # # unlink("vignettes/figures/ssc_vignette_renders/duckdb") # # unlink("vignettes/figures/ssc_vignette_renders/Sample/", recursive = TRUE)