## ----libraryChunk, load-packages, include=FALSE------------------------------- # markdown packages library(rgnparser) library(magrittr) library(knitr) library(rmarkdown) library(rmdformats) library(prettydoc) library(htmltools) library(pkgdown) # Load core packages library(devtools) library(BiocManager) library(purrr) library(here) library(renv) library(bdc) library(CoordinateCleaner) library(dplyr) library(readr) library(stringr) library(lubridate) library(tidyselect) library(R.utils) library(tidyr) library(ggplot2) library(forcats) library(emld) library(rlang) library(xml2) library(mgsub) library(rvest) library(rnaturalearth) library(rnaturalearthdata) library(countrycode) library(janitor) library(circlize) library(paletteer) library(cowplot) library(igraph) library(ggspatial) library(sf) library(parallel) library(terra) # Dont detect cores to avoid GitHbub error old <- options() # code line i on.exit(options(old)) # code line i+1 options(mc.cores = parallel::detectCores()) ## ----secretRootPath, include=FALSE-------------------------------------------- # Set the RootPath to tempdir RootPath <- tempdir() if (!dir.exists(paste0(RootPath, "/Data_acquisition_workflow"))) { dir.create(paste0(RootPath, "/Data_acquisition_workflow"), recursive = TRUE) } ## ----global-options, include=FALSE-------------------------------------------- knitr::opts_chunk$set(error = TRUE, eval = TRUE, tidy = TRUE, warning = FALSE, root.dir = normalizePath(tempdir())) ## ----falseRootPath, eval=FALSE------------------------------------------------ # RootPath <- paste0("/your/path/here") ## ----CreateRootPath, warning=FALSE, collapse = TRUE--------------------------- # Create the working directory in the RootPath if it doesn't exist already if (!dir.exists(paste0(RootPath, "/Data_acquisition_workflow"))) { dir.create(paste0(RootPath, "/Data_acquisition_workflow"), recursive = TRUE) } # Set the working directory setwd(paste0(RootPath,"/Data_acquisition_workflow")) ## ----activate, collapse = TRUE------------------------------------------------ renv::activate(project = paste0(RootPath,"/Data_acquisition_workflow")) ## ----installPackages, message=FALSE, warning=FALSE, results=FALSE, collapse = TRUE, eval = FALSE---- # if (!require("BiocManager", quietly = TRUE)) # install.packages("BiocManager", repos = "http://cran.us.r-project.org") # # BiocManager::install("ComplexHeatmap") ## ----rnaturalearthhires, eval=FALSE------------------------------------------- # # Install remotes if needed # if (!require("remotes", quietly = TRUE)) # install.packages("remotes", repos = "http://cran.us.r-project.org") # # Download and then load rnaturalearthhires # remotes::install_github("ropensci/rnaturalearthhires") # install.packages("rnaturalearthhires", repos = "https://ropensci.r-universe.dev", type = "source") # library(rnaturalearthhires) ## ----installBeeBDC, results=TRUE, message=TRUE, eval = FALSE, collapse = TRUE---- # install.packages("BeeBDC") # library(BeeBDC) ## ----snapshot, collapse = TRUE------------------------------------------------ renv::snapshot(project = paste0(RootPath,"/Data_acquisition_workflow"), prompt = FALSE) ## ----dirMaker, collapse = TRUE, eval = FALSE---------------------------------- # BeeBDC::dirMaker( # RootPath = RootPath, # RDoc = "vignettes/BeeBDC_main.Rmd") %>% # # Add paths created by this function to the environment() # list2env(envir = parent.env(environment())) ## ----dirMakerSECRETELY, include = FALSE--------------------------------------- # For the sake of this tutorial, we will not use here::i_am in dirMaker, because we aren't allowed # to mess with package directories in this way. This will work-around to use the tempdir() DataPath <- paste0(RootPath, "/Data_acquisition_workflow") OutPath_Check <- paste0(RootPath, "/Data_acquisition_workflow/Output/Check") OutPath_Figures <- paste0(RootPath, "/Data_acquisition_workflow/Output/Figures") OutPath_Intermediate <- paste0(RootPath, "/Data_acquisition_workflow/Output/Intermediate") OutPath_Report <- paste0(RootPath, "/Data_acquisition_workflow/Output/Report") # Create these files if (!dir.exists(DataPath)) { dir.create(DataPath, recursive = TRUE)} if (!dir.exists(OutPath_Check)) { dir.create(OutPath_Check, recursive = TRUE)} if (!dir.exists(OutPath_Figures)) { dir.create(OutPath_Figures, recursive = TRUE)} if (!dir.exists(OutPath_Intermediate)) { dir.create(OutPath_Intermediate, recursive = TRUE)} if (!dir.exists(OutPath_Report)) { dir.create(OutPath_Report, recursive = TRUE)} ## ----lapply_library, results=FALSE, collapse = TRUE--------------------------- lapply(c("ComplexHeatmap", "magrittr"), library, character.only = TRUE) ## ----3.0, collapse = TRUE----------------------------------------------------- data("bees3sp", package = "BeeBDC") data("beesRaw", package = "BeeBDC") db_standardized <- dplyr::bind_rows(beesRaw, # Only keep a subset of columns from bees3sp bees3sp %>% dplyr::select(tidyselect::all_of(colnames(beesRaw)), countryCode)) ## ----3.1, collapse = TRUE----------------------------------------------------- check_pf <- bdc::bdc_scientificName_empty( data = db_standardized, sci_name = "scientificName") # now that this is saved, remove it to save space in memory rm(db_standardized) ## ----3.2, collapse = TRUE----------------------------------------------------- check_pf <- bdc::bdc_coordinates_empty( data = check_pf, lat = "decimalLatitude", lon = "decimalLongitude") ## ----3.3, collapse = TRUE----------------------------------------------------- check_pf <- bdc::bdc_coordinates_outOfRange( data = check_pf, lat = "decimalLatitude", lon = "decimalLongitude") ## ----3.4, collapse = TRUE----------------------------------------------------- check_pf <- bdc::bdc_basisOfRecords_notStandard( data = check_pf, basisOfRecord = "basisOfRecord", names_to_keep = c( # Keep all plus some at the bottom. "Event", "HUMAN_OBSERVATION", "HumanObservation", "LIVING_SPECIMEN", "LivingSpecimen", "MACHINE_OBSERVATION", "MachineObservation", "MATERIAL_SAMPLE", "O", "Occurrence", "MaterialSample", "OBSERVATION", "Preserved Specimen", "PRESERVED_SPECIMEN", "preservedspecimen Specimen", "Preservedspecimen", "PreservedSpecimen", "preservedspecimen", "S", "Specimen", "Taxon", "UNKNOWN", "", NA, "NA", "LITERATURE", "None", "Pinned Specimen", "Voucher reared", "Emerged specimen" )) ## ----3.5a, collapse = TRUE---------------------------------------------------- check_pf_noNa <- BeeBDC::countryNameCleanR( data = check_pf, # Create a Tibble of common issues in country names and their replacements commonProblems = dplyr::tibble(problem = c('U.S.A.', 'US','USA','usa','UNITED STATES', 'United States','U.S.A','MX','CA','Bras.','Braz.', 'Brasil','CNMI','USA TERRITORY: PUERTO RICO'), fix = c('United States of America','United States of America', 'United States of America','United States of America', 'United States of America','United States of America', 'United States of America','Mexico','Canada','Brazil', 'Brazil','Brazil','Northern Mariana Islands','PUERTO.RICO')) ) ## ----3.5b, message=FALSE, warning=FALSE, collapse = TRUE---------------------- suppressWarnings( countryOutput <- BeeBDC::jbd_CfC_chunker(data = check_pf_noNa, lat = "decimalLatitude", lon = "decimalLongitude", country = "country", # How many rows to process at a time stepSize = 1000000, # Start row chunkStart = 1, path = OutPath_Intermediate, # Normally, please use scale = "large" scale = "medium", mc.cores = 1), classes = "warning") ## ----3.5ci, collapse = TRUE--------------------------------------------------- check_pf <- dplyr::left_join(check_pf, countryOutput, by = "database_id", suffix = c("", "CO")) %>% # Take the new country name if the original is NA dplyr::mutate(country = dplyr::if_else(is.na(country), countryCO, country)) %>% # Remove duplicates if they arose from left_join! dplyr::distinct() ## ----3.5cii, eval = FALSE, collapse = TRUE------------------------------------ # check_pf %>% # readr::write_excel_csv(., # paste(OutPath_Intermediate, "01_prefilter_database.csv", # sep = "/")) ## ----3.5ciii, eval = FALSE, collapse = TRUE----------------------------------- # if(!exists("check_pf")){ # check_pf <- readr::read_csv(paste(DataPath, # "Output", "Intermediate", "01_prefilter_database.csv", sep = "/"), # col_types = BeeBDC::ColTypeR())} ## ----3.5civ, collapse = TRUE-------------------------------------------------- rm(check_pf_noNa, countryOutput) ## ----3.6, collapse = TRUE----------------------------------------------------- # Standardise country names and add ISO2 codes if needed check_pf <- bdc::bdc_country_standardized( # Remove the countryCode and country_suggested columns to avoid an error with # where two "countryCode" and "country_suggested" columns exist (i.e. if the dataset has been # run before) data = check_pf %>% dplyr::select(!tidyselect::any_of(c("countryCode", "country_suggested"))), country = "country" ) ## ----3.7, message=FALSE, warning=FALSE, collapse = TRUE----------------------- check_pf <- BeeBDC::jbd_Ctrans_chunker( # bdc_coordinates_transposed inputs data = check_pf, id = "database_id", lat = "decimalLatitude", lon = "decimalLongitude", country = "country", countryCode = "countryCode", border_buffer = 0.2, # in decimal degrees (~22 km at the equator) save_outputs = TRUE, sci_names = "scientificName", # chunker inputs stepSize = 1000000, # How many rows to process at a time chunkStart = 1, # Start row append = FALSE, # If FALSE it may overwrite existing dataset progressiveSave = FALSE, # In a normal run, please use scale = "large" scale = "medium", path = OutPath_Check, mc.cores = 1 ) ## ----3.7ii, eval = FALSE, collapse = TRUE------------------------------------- # table(check_pf$coordinates_transposed, useNA = "always") ## ----3.7iii, eval = FALSE, collapse = TRUE------------------------------------ # check_pf %>% # readr::write_excel_csv(., # paste(OutPath_Intermediate, "01_prefilter_database.csv", # sep = "/")) ## ----3.7iv, eval = FALSE, collapse = TRUE------------------------------------- # if(!exists("check_pf")){ # check_pf <- readr::read_csv(paste(OutPath_Intermediate, "01_prefilter_database.csv", # sep = "/"), col_types = BeeBDC::ColTypeR())} ## ----3.8, collapse = TRUE----------------------------------------------------- check_pf <- BeeBDC::jbd_coordCountryInconsistent( data = check_pf, lon = "decimalLongitude", lat = "decimalLatitude", scale = 50, pointBuffer = 0.01) ## ----3.8ii, eval = FALSE------------------------------------------------------ # check_pf %>% # readr::write_excel_csv(., # paste(OutPath_Intermediate, "01_prefilter_database.csv", # sep = "/")) ## ----3.9, eval = TRUE, collapse = TRUE---------------------------------------- xyFromLocality <- bdc::bdc_coordinates_from_locality( data = check_pf, locality = "locality", lon = "decimalLongitude", lat = "decimalLatitude", save_outputs = FALSE ) ## ----3.9ii, eval = FALSE------------------------------------------------------ # # Save the resultant data # xyFromLocality %>% readr::write_excel_csv(paste(OutPath_Check, "01_coordinates_from_locality.csv", # sep = "/")) ## ----3.9iii, eval = FALSE----------------------------------------------------- # rm(xyFromLocality) ## ----3.10, collapse = TRUE---------------------------------------------------- check_pf <- BeeBDC::flagAbsent(data = check_pf, PresAbs = "occurrenceStatus") ## ----3.11, collapse = TRUE---------------------------------------------------- check_pf <- BeeBDC::flagLicense(data = check_pf, strings_to_restrict = "all", # DON'T flag if in the following dataSource(s) excludeDataSource = NULL) ## ----3.12, collapse = TRUE---------------------------------------------------- check_pf <- BeeBDC::GBIFissues(data = check_pf, issueColumn = "issue", GBIFflags = c("COORDINATE_INVALID", "ZERO_COORDINATE")) ## ----3.13a, eval = FALSE------------------------------------------------------ # flagFile <- BeeBDC::flagRecorder( # data = check_pf, # outPath = paste(OutPath_Report, sep =""), # fileName = paste0("flagsRecorded_", Sys.Date(), ".csv"), # # These are the columns that will be kept along with the flags # idColumns = c("database_id", "id", "catalogNumber", "occurrenceID", "dataSource"), # # TRUE if you want to find a file from a previous part of the script to append to # append = FALSE) # ## ----3.13b, collapse = TRUE--------------------------------------------------- check_pf <- BeeBDC::summaryFun( data = check_pf, # Don't filter these columns (or NULL) dontFilterThese = NULL, # Remove the filtering columns? removeFilterColumns = FALSE, # Filter to ONLY cleaned data? filterClean = FALSE) ## ----3.13c, eval = FALSE------------------------------------------------------ # (report <- bdc::bdc_create_report(data = check_pf, # database_id = "database_id", # workflow_step = "prefilter", # save_report = TRUE) # ) ## ----3.14, eval = FALSE------------------------------------------------------- # check_pf %>% # readr::write_excel_csv(., paste(OutPath_Intermediate, "01_prefilter_output.csv", # sep = "/")) ## ----4.0, collapse = TRUE----------------------------------------------------- if(!exists("check_pf")){ database <- readr::read_csv( paste(OutPath_Intermediate, "01_prefilter_output.csv", sep = "/"), col_types = BeeBDC::ColTypeR()) }else{ # OR rename and remove database <- check_pf # Remove spent dataset rm(check_pf)} ## ----4.0ii, collapse = TRUE--------------------------------------------------- database <- database %>% dplyr::select(!tidyselect::any_of("names_clean")) ## ----4.1, eval = FALSE, collapse = TRUE--------------------------------------- # parse_names <- # bdc::bdc_clean_names(sci_names = database$scientificName, save_outputs = FALSE) ## ----4.1ii, collapse = TRUE, eval = FALSE------------------------------------- # parse_names <- # parse_names %>% # dplyr::select(.uncer_terms, names_clean) ## ----4.1iii, collapse = TRUE-------------------------------------------------- database <- dplyr::bind_cols(database) rm(parse_names) ## ----4.2, collapse = TRUE, eval = FALSE--------------------------------------- # taxonomyFile <- BeeBDC::beesTaxonomy() ## ----4.2secret, collapse=TRUE, include=FALSE---------------------------------- # load in the small test dataset in the background system.file("extdata", "testTaxonomy.rda", package="BeeBDC") |> load() # Rename the file taxonomyFile <- testTaxonomy rm(testTaxonomy) ## ----4.2ii, collapse = TRUE--------------------------------------------------- database <- BeeBDC::harmoniseR(path = DataPath, #The path to a folder that the output can be saved taxonomy = taxonomyFile, # The formatted taxonomy file data = database, mc.cores = 1) ## ----4.2iii, collapse = TRUE-------------------------------------------------- rm(taxonomyFile) ## ----4.2iv, eval = FALSE, collapse = TRUE------------------------------------- # database %>% # readr::write_excel_csv(., # paste(DataPath, "Output", "Intermediate", "02_taxonomy_database.csv", # sep = "/")) ## ----4.3, eval = FALSE, collapse = TRUE--------------------------------------- # flagFile <- BeeBDC::flagRecorder( # data = database, # outPath = paste(OutPath_Report, sep =""), # fileName = paste0("flagsRecorded_", Sys.Date(), ".csv"), # idColumns = c("database_id", "id", "catalogNumber", "occurrenceID", "dataSource"), # append = TRUE, # printSummary = TRUE) ## ----5.0, collapse = TRUE----------------------------------------------------- if(!exists("database")){ database <- readr::read_csv(paste(OutPath_Intermediate, "02_taxonomy_database.csv", sep = "/"), col_types = BeeBDC::ColTypeR())} ## ----5.1, collapse = TRUE----------------------------------------------------- check_space <- BeeBDC::jbd_coordinates_precision( data = database, lon = "decimalLongitude", lat = "decimalLatitude", ndec = 2 # number of decimals to be tested ) ## ----5.1ii, collapse = TRUE--------------------------------------------------- rm(database) ## ----5.1iii, eval = FALSE, collapse = TRUE------------------------------------ # check_space %>% # readr::write_excel_csv(., # paste(OutPath_Intermediate, "03_space_inter_database.csv", # sep = "/")) ## ----5.2, eval = FALSE, collapse = TRUE--------------------------------------- # tempSpace <- check_space %>% # dplyr::filter(!.coordinates_empty == FALSE) %>% # dplyr::filter(!.coordinates_outOfRange == FALSE) ## ----5.2ii, message=TRUE, warning=FALSE, eval = FALSE, collapse = TRUE-------- # tempSpace <- # CoordinateCleaner::clean_coordinates( # x = tempSpace, # lon = "decimalLongitude", # lat = "decimalLatitude", # species = "scientificName", # countries = NULL, # Tests if coords are from x country. This is not needed. # tests = c( # "capitals", # records within 0.5 km of capitals centroids # "centroids", # records within 1 km around country and province centroids # "equal", # records with equal coordinates # "gbif", # records within 1 km of GBIF headquarters. (says 1 degree in package, but code says 1000 m) # "institutions", # records within 100m of zoo and herbaria # "zeros" # records with coordinates 0,0 # # "seas" # Not flagged as this should be flagged by coordinate country inconsistent # ), # capitals_rad = 1000, # centroids_rad = 500, # centroids_detail = "both", # test both country and province centroids # inst_rad = 100, # remove zoo and herbaria within 100m # range_rad = 0, # zeros_rad = 0.5, # capitals_ref = NULL, # centroids_ref = NULL, # country_ref = NULL, # country_refcol = "countryCode", # inst_ref = NULL, # range_ref = NULL, # # seas_scale = 50, # value = "spatialvalid" # result of tests are appended in separate columns # ) %>% # # Remove duplicate .summary column that can be replaced later and turn into a tibble # dplyr::select(!tidyselect::starts_with(".summary")) %>% # dplyr::tibble() ## ----5.2iii, eval = FALSE, collapse = TRUE------------------------------------ # check_space <- tempSpace %>% # # Re-bind with the records that were removed earlier # dplyr::bind_rows(check_space %>% # dplyr::filter(.coordinates_empty == FALSE | # .coordinates_outOfRange == FALSE) ) ## ----5.2iv, eval = FALSE, collapse = TRUE------------------------------------- # rm(tempSpace) ## ----5.2v, eval = FALSE, collapse = TRUE-------------------------------------- # check_space %>% # readr::write_excel_csv(paste(OutPath_Intermediate, "03_space_inter_database.csv", # sep = "/")) ## ----5.3, collapse = TRUE----------------------------------------------------- check_space <- BeeBDC::diagonAlley( data = check_space, # The minimum number of repeats needed to find a sequence in for flagging minRepeats = 6, ndec = 3, groupingColumns = c("eventDate", "recordedBy", "datasetName"), mc.cores = 1) ## ----5.3ii, collapse = TRUE--------------------------------------------------- griddingDF <- check_space %>% # Exclude NA lat and lon values tidyr::drop_na(c("decimalLatitude", "decimalLongitude")) %>% # Group by the dataset name dplyr::group_by(datasetName) %>% # Remove rows that aren't unique for lat and long dplyr::distinct(decimalLongitude, decimalLatitude, .keep_all = TRUE) %>% # Find the groups with 4 or more occurrence records dplyr::filter(dplyr::n() >= 4) %>% dplyr::ungroup() ## ----5.3iii, eval = FALSE, collapse = TRUE------------------------------------ # gridded_datasets <- CoordinateCleaner::cd_round( # x = griddingDF, # lon = "decimalLongitude", # lat = "decimalLatitude", # ds = "datasetName", # T1 = 7, # min_unique_ds_size = 4, # test = "both", # value = "dataset", # graphs = FALSE, # verbose = TRUE, # reg_out_thresh = 2, # reg_dist_min = 0.1, # reg_dist_max = 2 # ) %>% # dplyr::tibble() # # The griddingDF is no longer needed. remove it. # rm(griddingDF) ## ----5.3iv, eval = FALSE, collapse = TRUE------------------------------------- # check_space <- check_space %>% # # Join the datasets # dplyr::left_join( # # Select the columns of interest # dplyr::select(gridded_datasets, dataset, lon.flag, lat.flag, summary), # by = c("datasetName" = "dataset")) %>% # # Make new columns with more-consistent naming and change the NA vlaues to = TRUE (not flagged) # dplyr::mutate(.lonFlag = tidyr::replace_na(lon.flag, TRUE), # .latFlag = tidyr::replace_na(lat.flag, TRUE), # .gridSummary = tidyr::replace_na(summary, TRUE)) %>% # # Remove old columns # dplyr::select(!c(lon.flag, lat.flag, summary)) ## ----5.3 v, eval = FALSE, collapse = TRUE------------------------------------- # gridded_datasets %>% # readr::write_excel_csv(paste(OutPath_Intermediate, "03_space_griddedDatasets.csv", # sep = "/")) ## ----5.3vi, eval = FALSE, collapse = TRUE------------------------------------- # rm(gridded_datasets) ## ----5.4, collapse = TRUE----------------------------------------------------- check_space <- BeeBDC::coordUncerFlagR(data = check_space, uncerColumn = "coordinateUncertaintyInMeters", threshold = 1000) ## ----5.5, collapse = TRUE, eval = FALSE--------------------------------------- # checklistFile <- BeeBDC::beesChecklist() ## ----5.5secret, collapse = TRUE, eval = TRUE---------------------------------- # load in the small test dataset in the background system.file("extdata", "testChecklist.rda", package="BeeBDC") |> load() # Rename the file taxonomyFile <- testChecklist rm(testChecklist) ## ----5.5iia, collapse = TRUE-------------------------------------------------- check_space <- BeeBDC::countryOutlieRs(checklist = checklistFile, data = check_space, keepAdjacentCountry = TRUE, pointBuffer = 0.05, # Scale of map to return, one of 110, 50, 10 OR 'small', 'medium', 'large' # Smaller numbers will result in much longer calculation times. # We have not attempted a scale of 10. scale = 50, mc.cores = 1) ## ----5.5iib, collapse = TRUE-------------------------------------------------- check_space <- BeeBDC::continentOutlieRs(checklist = checklistFile, data = check_space, keepAdjacentContinent = FALSE, pointBuffer = 0.05, # Scale of map to return, one of 110, 50, 10 OR 'small', 'medium', 'large' # Smaller numbers will result in much longer calculation times. # We have not attempted a scale of 10. scale = 50, mc.cores = 1) ## ----5.5iii, eval = FALSE, collapse = TRUE------------------------------------ # # A list of failed species-country combinations and their numbers can be output here # check_space %>% # dplyr::filter(.countryOutlier == FALSE) %>% # dplyr::select(database_id, scientificName, country) %>% # dplyr::group_by(scientificName) %>% # dplyr::mutate(count_scientificName = n()) %>% # dplyr::distinct(scientificName, country, .keep_all = TRUE) %>% # readr::write_excel_csv(paste(OutPath_Intermediate, "03_space_failedCountryChecklist.csv", # sep = "/")) ## ----5.6, eval = FALSE, collapse = TRUE--------------------------------------- # check_space <- BeeBDC::summaryFun( # data = check_space, # dontFilterThese = NULL, # removeFilterColumns = FALSE, # filterClean = FALSE) ## ----5.6ii, eval = FALSE, collapse = TRUE------------------------------------- # check_space %>% # dplyr::filter(.summary == FALSE) %>% # map only records flagged as FALSE # bdc::bdc_quickmap( # data = ., # lon = "decimalLongitude", # lat = "decimalLatitude", # col_to_map = ".summary", # size = 0.9 # ) # ## ----5.7, eval = FALSE, collapse = TRUE--------------------------------------- # (report <- # bdc::bdc_create_report( # data = dplyr::tibble(check_space %>% dplyr::select(!.uncer_terms)), # database_id = "database_id", # workflow_step = "space", # save_report = TRUE) # ) ## ----5.8, eval = FALSE, collapse = TRUE--------------------------------------- # (figures <- # BeeBDC::jbd_create_figures( # data = dplyr::tibble(check_space %>% dplyr::select(!.uncer_terms)), # path = DataPath, # database_id = "database_id", # workflow_step = "space", # save_figures = TRUE) # ) ## ----5.8ii, eval = FALSE, collapse = TRUE------------------------------------- # check_space %>% # readr::write_excel_csv(paste(OutPath_Intermediate, "03_space_inter_database.csv", # sep = "/")) ## ----5.9, eval = FALSE, collapse = TRUE--------------------------------------- # BeeBDC::flagRecorder( # data = check_space, # outPath = paste(OutPath_Report, sep =""), # fileName = paste0("flagsRecorded_", Sys.Date(), ".csv"), # idColumns = c("database_id", "id", "catalogNumber", "occurrenceID", "dataSource"), # append = TRUE, # printSummary = TRUE) ## ----5.10, eval = FALSE, collapse = TRUE-------------------------------------- # check_space %>% # readr::write_excel_csv(., # paste(OutPath_Intermediate, "03_space_database.csv", # sep = "/")) ## ----6.0, collapse = TRUE----------------------------------------------------- if(!exists("check_space")){ check_time <- readr::read_csv(paste(OutPath_Intermediate, "03_space_database.csv", sep = "/"), col_types = BeeBDC::ColTypeR()) }else{ check_time <- check_space # Remove the spent file rm(check_space)} ## ----6.0ii, collapse = TRUE--------------------------------------------------- hist(lubridate::ymd_hms(check_time$eventDate, truncated = 5), breaks = 20, main = "Histogram of eventDates") ## ----6.0iii, collapse = TRUE-------------------------------------------------- check_time$year <- ifelse(check_time$year > lubridate::year(Sys.Date()) | check_time$year < 1600, NA, check_time$year) check_time$month <- ifelse(check_time$month > 12 | check_time$month < 1, NA, check_time$month) check_time$day <- ifelse(check_time$day > 31 | check_time$day < 1, NA, check_time$day) ## ----6.1, collapse = TRUE----------------------------------------------------- check_time <- BeeBDC::dateFindR(data = check_time, # Years above this are removed (from the recovered dates only) maxYear = lubridate::year(Sys.Date()), # Years below this are removed (from the recovered dates only) minYear = 1700) ## ----6.2, collapse = TRUE----------------------------------------------------- check_time <- bdc::bdc_eventDate_empty(data = check_time, eventDate = "eventDate") ## ----6.3, collapse = TRUE----------------------------------------------------- check_time <- bdc::bdc_year_outOfRange(data = check_time, eventDate = "year", year_threshold = 1950) ## ----6.4, eval = TRUE, collapse = TRUE---------------------------------------- check_time <- BeeBDC::summaryFun( data = check_time, # Don't filter these columns (or NULL) dontFilterThese = c(".gridSummary", ".lonFlag", ".latFlag", ".uncer_terms"), # Remove the filtering columns? removeFilterColumns = FALSE, # Filter to ONLY cleaned data? filterClean = FALSE) ## ----6.4ii, eval = FALSE, collapse = TRUE------------------------------------- # ( report <- # bdc::bdc_create_report(data = check_time, # database_id = "database_id", # workflow_step = "time", # save_report = FALSE) # ) ## ----6.5, eval = FALSE, collapse = TRUE--------------------------------------- # figures <- # BeeBDC::jbd_create_figures(data = check_time, # path = DataPath, # database_id = "database_id", # workflow_step = "time", # save_figures = TRUE) ## ----6.5ii, eval = FALSE, collapse = TRUE------------------------------------- # figures$year ## ----6.5iii, eval = FALSE, collapse = TRUE------------------------------------ # check_time %>% # readr::write_excel_csv(., # paste(OutPath_Intermediate, "04_time_database.csv", # sep = "/")) ## ----eval = FALSE, collapse = TRUE-------------------------------------------- # BeeBDC::flagRecorder( # data = check_time, # outPath = paste(OutPath_Report, sep =""), # fileName = paste0("flagsRecorded_", Sys.Date(), ".csv"), # idColumns = c("database_id", "id", "catalogNumber", "occurrenceID", "dataSource"), # append = TRUE, # printSummary = TRUE) # ## ----7.0, eval = FALSE, collapse = TRUE--------------------------------------- # if(!exists("check_time")){ # check_time <- # readr::read_csv(paste(OutPath_Intermediate, "04_time_database.csv", # sep = "/"), # col_types = BeeBDC::ColTypeR())} ## ----7.1, collapse = TRUE----------------------------------------------------- check_time <- BeeBDC::dupeSummary( data = check_time, path = OutPath_Report, # options are "ID","collectionInfo", or "both" duplicatedBy = "collectionInfo", # The columns to generate completeness info from (and to sort by completness) completeness_cols = c("decimalLatitude", "decimalLongitude", "scientificName", "eventDate"), # The columns to ADDITIONALLY consider when finding duplicates in collectionInfo collectionCols = c("decimalLatitude", "decimalLongitude", "scientificName", "eventDate", "recordedBy"), # The columns to combine, one-by-one with the collectionCols collectInfoColumns = c("catalogNumber", "otherCatalogNumbers"), # Custom comparisons — as a list of columns to compare # RAW custom comparisons do not use the character and number thresholds CustomComparisonsRAW = dplyr::lst(c("catalogNumber", "institutionCode", "scientificName")), # Other custom comparisons use the character and number thresholds CustomComparisons = dplyr::lst(c("gbifID", "scientificName"), c("occurrenceID", "scientificName"), c("recordId", "scientificName"), c("id", "scientificName")), # The order in which you want to KEEP duplicated based on data source # try unique(check_time$dataSource) sourceOrder = c("CAES", "Gai", "Ecd","BMont", "BMin", "EPEL", "ASP", "KP", "EcoS", "EaCO", "FSCA", "Bal", "SMC", "Lic", "Arm", "USGS", "ALA", "VicWam", "GBIF","SCAN","iDigBio"), # Paige ordering is done using the database_id prefix, not the dataSource prefix. prefixOrder = c("Paige", "Dorey"), # Set the complexity threshold for id letter and number length # minimum number of characters when WITH the numberThreshold characterThreshold = 2, # minimum number of numbers when WITH the characterThreshold numberThreshold = 3, # Minimum number of numbers WITHOUT any characters numberOnlyThreshold = 5 ) %>% # END dupeSummary dplyr::as_tibble(col_types = BeeBDC::ColTypeR()) ## ----7.1ii, eval = FALSE, collapse = TRUE------------------------------------- # check_time %>% # readr::write_excel_csv(., # paste(OutPath_Intermediate, "04_2_dup_database.csv", # sep = "/")) ## ----7.2, eval = FALSE, collapse = TRUE--------------------------------------- # BeeBDC::flagRecorder( # data = check_time, # outPath = paste(OutPath_Report, sep =""), # fileName = paste0("flagsRecorded_", Sys.Date(), ".csv"), # idColumns = c("database_id", "id", "catalogNumber", "occurrenceID", "dataSource"), # append = TRUE, # printSummary = TRUE) ## ----8.0, eval = FALSE, collapse = TRUE--------------------------------------- # if(!exists("check_time")){ # check_time <- # readr::read_csv(paste(OutPath_Intermediate, "04_2_dup_database.csv", # sep = "/"), col_types = ColTypeR())} ## ----8.1, eval = TRUE, collapse = TRUE---------------------------------------- if(!exists("duplicates")){ duplicates <- BeeBDC::fileFinder(path = DataPath, fileName = "duplicateRun_") %>% readr::read_csv()} ## ----8.2, eval = TRUE, collapse = TRUE---------------------------------------- # Make sure that the .summary column is updated check_time <- BeeBDC::summaryFun( data = check_time, dontFilterThese = c(".gridSummary", ".lonFlag", ".latFlag", ".uncer_terms", ".uncertaintyThreshold"), removeFilterColumns = FALSE, filterClean = FALSE) ## ----8.2ii, eval = FALSE, collapse = TRUE------------------------------------- # # Save the uncleaned dataset # check_time %>% readr::write_excel_csv(., # paste(OutPath_Intermediate, "05_unCleaned_database.csv", # sep = "/")) ## ----8.3, eval = TRUE, collapse = TRUE---------------------------------------- cleanData <- BeeBDC::summaryFun( data = check_time, dontFilterThese = c(".gridSummary", ".lonFlag", ".latFlag", ".uncer_terms", ".uncertaintyThreshold"), # Remove the filtering columns? removeFilterColumns = TRUE, # Filter to ONLY cleaned data? filterClean = TRUE) ## ----8.3ii, eval = FALSE, collapse = TRUE------------------------------------- # # Save this CLEANED dataset # cleanData %>% readr::write_excel_csv(., # paste(OutPath_Intermediate, "05_cleaned_database.csv", # sep = "/")) ## ----9.1, message=FALSE, warning=FALSE, eval = FALSE, collapse = TRUE--------- # if (!require("BiocManager", quietly = TRUE)) # install.packages("BiocManager", repos = "http://cran.us.r-project.org") # BiocManager::install("ComplexHeatmap") ## ----9.1ii, eval = TRUE, collapse = TRUE-------------------------------------- if(!exists("duplicates")){ duplicates <- BeeBDC::fileFinder(path = DataPath, fileName = "duplicateRun_") %>% readr::read_csv()} ## ----9.1on.exit, include = FALSE---------------------------------------------- oldpar <- par(no.readonly = TRUE) on.exit(oldpar) ## ----9.1iii, eval = FALSE, collapse = TRUE------------------------------------ # par(mar = c(2, 2, 2, 2)/2, mfrow = c(1,1)) ## ----9.1iv, eval=FALSE, fig.fullwidth=TRUE, fig.height=7.5, fig.width=9, collapse = TRUE---- # BeeBDC::chordDiagramR( # # The duplicate data from the dupeSummary function output # dupeData = duplicates, # outPath = OutPath_Figures, # fileName = "ChordDiagram.pdf", # # These can be modified to help fit the final pdf that's exported. # width = 9, # height = 7.5, # bg = "white", # # How few distinct dataSources should a group have to be listed as "other" # smallGrpThreshold = 3, # title = "Duplicated record sources", # # The default list of colour palettes to choose from usign the paleteer package # palettes = c("cartography::blue.pal", "cartography::green.pal", # "cartography::sand.pal", "cartography::orange.pal", "cartography::red.pal", # "cartography::purple.pal", "cartography::brown.pal"), # canvas.ylim = c(-1.0,1.0), # canvas.xlim = c(-0.6, 0.25), # text.col = "black", # legendX = grid::unit(6, "mm"), # legendY = grid::unit(18, "mm"), # legendJustify = c("left", "bottom"), # niceFacing = TRUE) ## ----9.2, eval = TRUE, collapse = TRUE---------------------------------------- if(!exists("check_time")){ beeData <- readr::read_csv(paste(OutPath_Intermediate, "05_unCleaned_database.csv", sep = "/"), col_types = BeeBDC::ColTypeR()) }else{ beeData <- check_time rm(check_time) } ## ----9.2ii, warning=FALSE, eval=TRUE, collapse = TRUE------------------------- BeeBDC::dupePlotR( data = beeData, # The outPath to save the plot as outPath = OutPath_Figures, fileName = "duplicatePlot.pdf", # Colours in order: duplicate, kept duplicate, unique dupeColours = c("#F2D2A2","#B9D6BC", "#349B90"), # Plot size and height base_height = 7, base_width = 7, legend.position = c(0.85, 0.8), # Extra variables can be fed into forcats::fct_recode() to change names on plot GBIF = "GBIF", SCAN = "SCAN", iDigBio = "iDigBio", USGS = "USGS", ALA = "ALA", ASP = "ASP", CAES = "CAES", Ecd = "Ecd", returnPlot = TRUE ) ## ----9.3, fig.width=15, fig.height=9, fig.fullwidth=TRUE, eval=TRUE, collapse = TRUE---- BeeBDC::plotFlagSummary( data = beeData, # Colours in order of pass (TRUE), fail (FALSE), and NA flagColours = c("#127852", "#A7002D", "#BDBABB"), fileName = paste0("FlagsPlot_", Sys.Date(),".pdf"), outPath = paste0(OutPath_Figures), width = 15, height = 9, # OPTIONAL: # # Filter to a single species # speciesName = "Holcopasites heliopsis", # # column to look in # nameColumn = "species", # # Save the filtered data # saveFiltered = TRUE, # # Filter column to display on map # filterColumn = ".summary", # plotMap = TRUE, # # amount to jitter points if desired, e.g. 0.25 or NULL # jitterValue = NULL, # # Map opacity value for points between 0 and 1 # mapAlpha = 1, # # If a user wants to output the table used to make the figure, change this to TRUE # saveTable = FALSE, # Extra variables can be fed into forcats::fct_recode() to change names on plot GBIF = "GBIF", SCAN = "SCAN", iDigBio = "iDigBio", USGS = "USGS", ALA = "ALA", ASP = "ASP", CAES = "CAES", 'BMont' = "BMont", 'BMin' = "BMin", Ecd = "Ecd", Gaiarsa = "Gai", EPEL = "EPEL", VicWam = "VicWam", returnPlot = TRUE ) ## ----9.4, eval = TRUE, collapse = TRUE---------------------------------------- if(!exists("cleanData")){ cleanData <- readr::read_csv(paste(OutPath_Intermediate, "05_cleaned_database.csv", sep = "/"), col_types = BeeBDC::ColTypeR())} ## ----9.4a, eval=FALSE, collapse = TRUE---------------------------------------- # BeeBDC::summaryMaps( # data = cleanData, # width = 10, height = 10, # class_n = 3, # class_Style = "fisher", # fileName = "CountryMaps_fisher.pdf", # outPath = OutPath_Figures, # returnPlot = TRUE # ) ## ----9.4b, eval = FALSE, collapse = TRUE-------------------------------------- # BeeBDC::interactiveMapR( # # occurrence data # data = beeData, # # Directory where to save files # outPath = paste0(OutPath_Figures, "interactiveMaps", sep = "/"), # lon = "decimalLongitude", # lat = "decimalLatitude", # # Occurrence dataset column with species names # speciesColumn = "scientificName", # # Which species to map — a character vector of names or "ALL" # # Note: "ALL" is defined AFTER filtering for country # speciesList = "ALL", # countryList = NULL, # study area # # Point jitter to see stacked points — jitters an amount in decimal degrees # jitterValue = 0.01 # ) ## ----9.5, eval = FALSE, collapse = TRUE--------------------------------------- # if(!exists("cleanData")){ # cleanData <- readr::read_csv(paste(OutPath_Intermediate, "05_cleaned_database.csv", # sep = "/"), # col_types = BeeBDC::ColTypeR(), # locale = readr::locale(encoding = "UTF-8"))} ## ----9.5ii, eval = TRUE, collapse = TRUE-------------------------------------- # Note, if outPath = NULL then no file will be saved dataProvTable <- BeeBDC::dataProvTables(data = cleanData, runBeeDataChecks = TRUE, outPath = NULL, fileName = "dataProvTable.csv") ## ----9.6, eval = TRUE, collapse = TRUE---------------------------------------- # Note, if outPath = NULL then no file will be saved summaryTable <- BeeBDC::flagSummaryTable(data = beeData, column = "scientificName", outPath = NULL, fileName = "flagTable.csv") ## ----cleanup, include=FALSE, collapse = TRUE---------------------------------- # Remove the webpage folder unlink(paste0(dirname(getwd()), "/inst/extdata/WebDir"), recursive = TRUE)