globalVariables(".data")

#' Read and Validate BEQI2 Input Files
#'
#' This function reads and checks BEQI2 input files. The format has been
#' specified in Van Loon (2013) and is described in the vignette of the
#' BENMMI-package.
#'
#' @param filename name of BEQI2 input file (\code{character})
#'
#' @details The function performs the following tasks:
#' \itemize{
#'      \item{checks the existence of \code{filename};}
#'  	\item{checks availablitity of required columns (case insensitive);}
#'      \item{make column names with aggregation data case-insensitive;}
#'  	\item{removes redundant spaces;}
#'      \item{checks if DATE-field adheres to ISO 8601 (YYYY-mm-dd);}
#'  	\item{constructs a unique identifier \code{ID} by concatenating
#'          columns \code{OBJECTID} and \code{DATE};}
#'      \item{checks that each \code{ID} has a unique \code{AREA};}
#'      \item{checks azoic samples for VALUE=0;}
#'      \item{removes records with VALUE=0, not belonging to azoic samples;}
#'      \item{checks VALUE-field on missing values;}
#'      \item{checks if VALUE-field is an integer;}
#'  }
#'
#' @references Willem van Loon, 2013. BEQI2 INPUT FORMAT. See the
#'   package-vignette of the BENMMI-package.
#'
#' @importFrom utils read.csv
#'
#' @export
read_beqi2 <- function(filename) {

  # check if BEQI2-file exists
  if (!file.exists(filename)) {
    stop(sprintf("File %s not found", sQuote(filename)), call. = FALSE)
  }

  # read BEQI2-file
  d <- read.csv(file = filename, as.is = TRUE, na.strings = c("NA", ""))

  # validation
  validate_beqi2(d)
}



#' @describeIn read_beqi2 validator for BEQI2-format
#' @param x table in BEQI2-format
#'
#' @importFrom dplyr distinct
#' @importFrom tibble as_tibble
validate_beqi2 <- function(x) {

  # convert to tibble
  x <- x |>
    as_tibble()

  # CHAR is not required anymore: patch for backward-compatibility
  if (!("CHAR" %in% names(x))) {
    x$CHAR <- NA_character_
  }

  # check column names (case insensitive)
  required_vars <- c("OBJECTID", "HABITAT", "SAMPLEID", "TAXON",
                     "CHAR", "SAMPDEV", "AREA", "DATE", "VALUE")
  missing_vars <- setdiff(required_vars, toupper(names(x)))
  if (length(missing_vars) > 0L) {
    stop(sprintf("The following columns are missing: %s",
                 toString(missing_vars)), call. = FALSE)
  }

  # check required columns on missing values
  for (required_var in required_vars |> setdiff(c("CHAR", "SAMPDEV"))) {
    is_missing <- is.na(x[[required_var]])
    if (any(is_missing)) {
      index <- which(is_missing)
      stop(
        sprintf(
          paste0(
            "The value in column %s is missing for %i records.\n",
            "The record indices are:\n%s"
          ),
          sQuote(required_var),
          length(index),
          toString(index)
        ),
        call. = FALSE
      )
    }
  }

  # check VALUE field on non-negative values
  is_negative <- x$VALUE < 0
  if (any(is_negative)) {
    stop(
      sprintf(
        paste0(
          "Negative values found in column %s.\n",
          "The record indices are: %s"
        ),
        sQuote("VALUE"),
        toString(which(is_negative))
      ),
      call. = FALSE
    )
  }

  # check VALUE field on non-integer values
  is_integer <- (x$VALUE - floor(x$VALUE)) < .Machine$double.eps
  if (any(!is_integer)) {
    warning(
      sprintf(
        paste0("%s out of %s non-integer values found in column %s.\n",
               "This is about %s%% of the records.\n",
               "Please make sure that these records contain species ",
               "abundances (and not species densities)."),
        sum(!is_integer),
        length(is_integer),
        sQuote("VALUE"),
        round(100 * sum(!is_integer) / length(is_integer))
      ),
      call. = FALSE
    )
  }

  # remove redundant spaces
  x[required_vars] <- x[required_vars] |>
    lapply(FUN = strip_spaces) |>
    as_tibble()

  # harmonize columns (same case)
  x[required_vars] <- x[required_vars] |>
    lapply("harmonize") |>
    as_tibble()

  # coerce date to class 'Date'
  res <- try(as.Date(x$DATE, format = "%Y-%m-%d"), silent = TRUE)
  if (inherits(res, "try-error") || any(is.na(res))) {
    stop(
      "Invalid date formats found. Please adhere to ISO 8601 (YYYY-mm-dd)",
      call. = FALSE
    )
  }
  x$DATE <- res

  # check if areas are unique for a specific ID
  n <- x |>
    group_by(.data$OBJECTID, .data$SAMPLEID, .data$DATE) |>
    summarise(n = length(unique(.data$AREA)), .groups = "drop") |>
    filter(.data$n != 1L)
  if (nrow(n) != 0L) {
    stop(
      sprintf(
        paste0(
          "AREA is not unique for %i samples. ",
          "These samples have the following OBJECTID/SAMPLEID/DATE:\n%s"
        ),
        nrow(n),
        n |>
          select("OBJECTID", "SAMPLEID", "DATE") |>
          apply(MARGIN = 1, paste, collapse = "/") |>
          paste(collapse = "; ")
      ),
      call. = FALSE
    )
  }

  # find duplicated records (all required columns, except for VALUE)
  d <- x[, required_vars |> setdiff(c("VALUE", "CHAR", "SAMPDEV"))]
  if (anyDuplicated(d)) {
    stop(
      sprintf(
        "Duplicated rows found in the benthos file.\nThe row numbers are:\n%s",
        toString(sort(which(duplicated(d) | duplicated(d, fromLast = TRUE))))
      ),
      call. = FALSE
    )
  }

  # handle azoic samples
  index <- which(is_azoic(x$TAXON))
  if (length(index) != 0L) {
    if (isTRUE(any(x$VALUE[index] != 0L))) {
      warning(
        paste(
          "Azoic sample(s) found with non-zero VALUE-field.",
          "These abundances will be set to zero"
        ),
        call. = FALSE
      )
    }
    x$VALUE[index] <- 0L
  }

  # remove non-Azoic records with zero counts as these are redundant
  # and won't affect the results
  is_zero <- (x$VALUE == 0L) & !is_azoic(x$TAXON)
  if (any(is_zero)) {
    warning(
      sprintf(
        paste(
          "Non-azoic records (n=%s) found with zero VALUE-field.",
          "These records are redundant and will be excluded."
        ),
        sum(is_zero)
      ),
      call. = FALSE
    )
    x <- x[!is_zero, ]
  }

  # final checks
  if (nrow(x) == 0L) {
    stop("No valid records found", call. = FALSE)
  }
  x <- x |>
    distinct()

  # return result
  x
}



#' Read and Validate Taxa Waterbeheer Nederland (TWN) Data
#'
#' This function reads files in the Taxa Waterbeheer Nederland (TWN) format.
#'
#' @details The function adds a new column \code{taxon}. Its contents depending
#'     on TWN-status:
#' \describe{
#'   \item{status = 10}{taxonname}
#'   \item{status = 20}{prefername}
#'   \item{status = 80}{parentname}
#' }
#'
#' @param filename name of TWN file (\code{character})
#'
#' @return a \code{tibble} with four columns:
#' \describe{
#'   \item{GROUP}{TWN/WoRMS taxon group}
#'   \item{LEVEL}{TWN/WoRMS taxon level}
#'   \item{FROM}{taxon name to convert from}
#'   \item{TO}{taxon name to convert to}
#' }
#'
#' @references \url{https://taxainfo.nl/}
#'
#' @importFrom utils read.csv
#'
#' @export
read_twn <- function(filename) {

  # raise error if filename is missing
  if (missing(filename)) {
    stop("Filename is missing", call. = FALSE)
  }

  # check if file exists
  if (!file.exists(filename)) {
    stop(sprintf("File %s not found", sQuote(filename)), call. = FALSE)
  }

  # read file
  d <- try(
    read.csv(file = filename, as.is = TRUE, na.strings = c("NA", "")),
    silent = TRUE
  )
  if (inherits(d, "try-error")) {
    stop(
      sprintf("Errors occurred while reading %s", sQuote(filename)),
      call. = FALSE
    )
  }

  # validate TWN-file
  validate_twn(d)
}



#' @describeIn read_twn get default WoRMS list (TWN list extended with species
#'   Southern North Sea)
#' @export
get_worms <- function() {
  .Deprecated("get_taxa")
}



#' @describeIn read_twn validator for TWN-format
#' @param x table in TWN-format
#'
#' @importFrom dplyr across arrange bind_rows distinct group_by filter mutate
#'             n select summarise
#' @importFrom tidyselect all_of
#' @importFrom tibble as_tibble
validate_twn <- function(x) {

  # convert to tibble
  x <- x |>
    as_tibble()

  # check column names
  required_vars <- c("status", "taxonname", "taxongroup", "prefername",
                     "parentname", "taxonlevel")
  missing_vars <- setdiff(required_vars, tolower(names(x)))
  if (length(missing_vars) > 0L) {
    stop(sprintf("The following columns are missing: %s",
                 toString(missing_vars)),
         call. = FALSE)
  }
  names(x) <- tolower(names(x))

  # select only columns of interest
  x <- x |>
    select(all_of(required_vars))

  # keep only status codes:
  #	10: preferred name
  #	20: synonym
  #	80: non-taxonomic species group (groups and aggregates)
  # see also www.aquo.nl/faq/faq-twn
  x <- x |>
    filter(.data$status %in% c(10L, 20L, 80L))

  # remove redundant spaces
  # (including leading and trailing spaces)
  x <- x |>
    mutate(across(where(is.character), strip_spaces))

  # construct TAXON
  x <- bind_rows(
    x |>
      filter(.data$status == 10) |>
      mutate(TAXON = .data$taxonname),
    x |>
      filter(.data$status == 20) |>
      mutate(TAXON = .data$prefername),
    x |>
      filter(.data$status == 80) |>
      mutate(TAXON = .data$parentname)
  )

  # for status 80 also the taxon level needs to be changed
  # to that of the parent
  id <- with(x,
    which((status == 80) &
        (taxonlevel %in% c("Genus combi", "Subgenus", "Species",
                           "Species combi", "Subspecies"))
    )
  )
  x$taxonlevel[id] <- "Genus"

  # create ordered factor of taxon levels
  x$taxonlevel <- factor(x$taxonlevel,
    levels = c(
               "Regio",
               "Regnum",
               "Phylum", "Subphylum",
               "Classis", "Subclassis", "Infraclassis",
               "Ordo", "Subordo", "Infraordo",
               "Superfamilia",	"Familia", "Subfamilia",
               "Tribe",
               "Genus", "Genus combi", "Subgenus",
               "Species", "Species combi", "Subspecies",
               "Varietas",
               "Forma"),
    ordered = TRUE
  )

  # selection
  x <- x |>
    select(GROUP = "taxongroup", LEVEL = "taxonlevel",
           FROM = "taxonname", TO = "TAXON") |>
    arrange(.data$GROUP, .data$FROM)

  # check if TO can be converted to FROM
  is_inconvertible <- is.na(x$TO) | is.na(x$FROM)
  if (any(is_inconvertible)) {
    stop(
      sprintf(
        "A total of %d taxon names in the taxon-file are inconvertible:\n%s",
        sum(is_inconvertible),
        x$FROM[is_inconvertible] |>
          sQuote() |>
          toString()
      ),
      call. = FALSE
    )
  }

  # check if all taxonomic groups are specified
  # (limits the selection of endofauna species)
  is_missing <- is.na(x$GROUP)
  if (any(is_missing)) {
    warning(
      sprintf(
        paste(
          "The taxonomic group is missing for %d records",
          "in the taxon-file.\n",
          "This may limit the removal of specific groups",
          "like decapoda and insecta."
        ),
        sum(is_missing)
      ),
      call. = FALSE
    )
  }

  # consistency check: "TO" should be part of only one "GROUP"
  d <- x |>
    select("GROUP", "TO") |>
    distinct() |>
    group_by(.data$TO) |>
    summarise(n = n(), .groups = "drop") |>
    filter(.data$n != 1L)
  is_inconsistent <- nrow(d) > 0L
  if (is_inconsistent) {
    stop(
      sprintf(
        paste(
          "Inconsistencies found in taxa file.",
          "Please check groups for taxa:\n %s"
        ),
        d$TO |>
          sQuote() |>
          toString()
      ),
      call. = FALSE
    )
  }

  # consistency check: "TO" should be part of only one "LEVEL"
  # not yet implemented since LEVEL refers to FROM and not yet to TO

  # return result
  x
}



#' Read and Validate Taxa Data
#'
#' This function reads files in the taxa format.
#'
#' @param filename name of taxa file
#'
#' @details Taxa files have the following format:
#'  \describe{
#' 	  \item{group}{taxonomic group}
#'    \item{provided}{provided taxon name}
#'    \item{accepted}{accepted taxon name}
#'    \item{level}{taxonomic level}
#' 	}
#' 	Other columns are allowed, but silently ingored.
#'
#' @importFrom readr read_csv cols_only col_character
#'
#' @export
read_taxa <- function(filename) {

  # raise error if filename is missing
  if (missing(filename)) {
    stop("Filename is missing", call. = FALSE)
  }

  # check if file exists
  if (!file.exists(filename)) {
    stop(sprintf("File %s not found", sQuote(filename)), call. = FALSE)
  }

  # read file
  d <- try(suppressMessages(read_csv(filename)), silent = TRUE)
  if (inherits(d, "try-error")) {
    stop(sprintf("Errors occurred while reading %s", sQuote(filename)),
         call. = FALSE)
  }

  # validate taxa-file
  d |>
    validate_taxa()
}



#' @describeIn read_taxa get default taxa list (TWN list extended with species
#'   Southern North Sea)
#' @importFrom readr read_rds
#' @export
get_taxa <- function() {
  read_rds(system.file("extdata", "taxa.rds", package = "benthos"))
}



#' @describeIn read_taxa validator for taxa-format
#' @param x table in taxa-format
#'
#' @importFrom dplyr all_of distinct filter group_by n select
#' @importFrom tibble as_tibble
validate_taxa <- function(x) {

  # convert to tibble
  x <- x |>
    as_tibble()

  # check column names
  required_vars <- c("group", "provided", "accepted", "level")
  missing_vars <- setdiff(required_vars, tolower(names(x)))
  if (length(missing_vars) > 0L) {
    stop(sprintf("The following columns are missing: %s",
                 toString(missing_vars)),
         call. = FALSE)
  }
  names(x) <- tolower(names(x))

  # select only columns of interest
  x <- x |>
    select(all_of(required_vars))

  # create ordered factor of taxon levels
  x$level <- factor(x$level,
    levels = c(
               "Regio",
               "Regnum",
               "Phylum", "Subphylum",
               "Classis", "Subclassis", "Infraclassis",
               "Ordo", "Subordo", "Infraordo",
               "Superfamilia",	"Familia", "Subfamilia",
               "Tribe",
               "Genus", "Genus combi", "Subgenus",
               "Species", "Species combi", "Subspecies",
               "Varietas",
               "Forma"),
    ordered = TRUE
  )

  # check if 'provided' can be converted to 'accepted'
  is_inconvertible <- is.na(x$provided) | is.na(x$accepted)
  if (any(is_inconvertible)) {
    stop(
      sprintf(
        "A total of %d taxon names in the taxon-file are inconvertible:\n%s",
        sum(is_inconvertible),
        x$provided[is_inconvertible] |>
          sQuote() |>
          toString()
      ),
      call. = FALSE
    )
  }

  # check if all taxonomic groups are specified
  # (limits the selection of endofauna species)
  is_missing <- is.na(x$group)
  if (any(is_missing)) {
    warning(
      sprintf(
        paste("The taxonomic group is missing for %d records",
              "in the taxon-file.\n",
              "This may limit the removal of specific groups",
              "like decapoda and insecta."),
        sum(is_missing)
      ),
      call. = FALSE
    )
  }

  # consistency check: "accepted" should be part of only one "group"
  d <- x |>
    select("group", "accepted") |>
    distinct() |>
    group_by(.data$accepted) |>
    summarise(n = n(), .groups = "drop") |>
    filter(.data$n != 1L)
  is_inconsistent <- nrow(d) > 0L
  if (is_inconsistent) {
    stop(sprintf(paste("Inconsistencies found in taxa file.",
                       "Please check groups for taxa:\n %s"),
                 d$accepted |>
                   sQuote() |>
                   toString()),
         call. = FALSE)
  }

  # consistency check: 'accepted' should be part of only one 'level'
  # not yet implemented since 'level' refers to 'provided'
  # and not yet to 'level' (will be checked by RWS)

  # return result
  x
}




#' Read and Validate AMBI Sensitivity Data
#'
#' This function reads and checks files with AMBI sensitivity data.
#' The data should be stored in 'comma separated values' format (csv)
#' consisting of two columns:
#' \describe{
#'   \item{TAXON}{species name;}
#'   \item{GROUP}{Roman numeral (I, II, III, IV, V) giving
#'     the sensitivity group}
#' }
#'
#' @param filename name of the AMBI sensitivity file (character)
#'
#' @details The function performs the following tasks:
#' \itemize{
#'   \item checks the existence of \code{filename};
#'   \item checks availability of required columns (case insensitive);
#'   \item removes redundant spaces;
#'   \item removes duplicated records.
#'  }
#'
#' @references Borja, A., J. Franco and V. Perez, 2000. A Marine Biotic Index
#'  to Establish the Ecological Quality of Soft-Bottom Benthos Within
#'  European Estuarine and Coastal Environments.
#'  Marine Pollution Bulletin 40:1100-1114
#'
#' @importFrom utils read.csv
#'
#' @export
read_ambi <- function(filename) {

  # check filename
  if (missing(filename)) {
    stop("Filename is missing", call. = FALSE)
  }

  # read AMBI-file
  if (!(file.exists(filename))) {
    stop(sprintf("File %s not found", sQuote(filename)), call. = FALSE)
  }

  d <- try(read.csv(file = filename, as.is = TRUE, na.strings = c("NA", "")),
           silent = TRUE)
  if (inherits(d, "try-error")) {
    stop(sprintf("Errors occurred while reading %s", sQuote(filename)),
         call. = FALSE)
  }

  # validate AMBI-file
  validate_ambi(d)
}



#' Get Supplementary AMBI Sensitivity Groups
#'
#' This function gets sensitivity groups that are supplementary to the AMBI of
#' Borja et al., (2000)
#'
#' @param which which AMBI supplement? Currently only the Dutch supplement is
#'     available (\code{which} = "NL")
#'
#' @return a data frame with columns \code{TAXON} containing taxa and
#'     \code{GROUP} containing Dutch AMBI-groups
#'
#' @references Borja, A., J. Franco and V. Perez, 2000. A Marine Biotic Index
#' to Establish the Ecological Quality of Soft-Bottom Benthos Within
#' European Estuarine and Coastal Environments.
#' Marine Pollution Bulletin 40:1100-1114
#'
#' @importFrom utils read.csv
#'
#' @export
get_ambi <- function(which = "NL") {
  switch(which,
         NL = readRDS(system.file("extdata", "ambi_nl.rds",
                                  package = "benthos")),
         stop("This AMBI-supplement is not available", call. = FALSE))
}



.get_ambi <- function() {
  frame_number <- sys.nframe()
  if (frame_number == 1L) {
    stop("Internal function, not to be called directly", call. = FALSE)
  }
  readRDS(system.file("extdata", "azti.rds", package = "benthos"))
}



#' @describeIn read_ambi validator for AMBI-format
#' @param x table in AMBI-format
validate_ambi <- function(x) {
  .validate_groups(x, permissable_groups = c("I", "II", "III", "IV", "V"))
}


#' @importFrom dplyr across distinct mutate
#' @importFrom tibble as_tibble
#' @importFrom tidyselect where
.validate_groups <- function(x, permissable_groups) {

  # convert to tibble
  x <- x |>
    as_tibble()

  # check column names (case insensitive)
  names(x) <- toupper(names(x))
  required_vars <- c("TAXON", "GROUP")
  missing_vars <- setdiff(required_vars, names(x))
  if (length(missing_vars) > 0L) {
    stop(sprintf("The following columns are missing: %s",
                 toString(missing_vars)),
         call. = FALSE)
  }

  # remove sp. from binomen with unknown species
  x$TAXON <- x$TAXON |>
    strip_sp()

  # remove redundant spaces
  x <- x |>
    mutate(across(where(is.character), strip_spaces))

  # remove duplicated records
  n <- nrow(x)
  x <- x |>
    distinct()
  n <- n - nrow(x)
  if (n != 0L) {
    message(sprintf("Number of duplicated records: %i", n))
    message("These will be removed")
  }

  # check on duplicated taxa
  if (anyDuplicated(x$TAXON)) {
    index <- which(duplicated(x$TAXON))
    stop(sprintf("Duplicated taxonnames found: %s", toString(x$TAXON[index])),
         call. = FALSE)
  }

  # check if all groups are allowed
  if (!all(x$GROUP %in% permissable_groups)) {
    stop(sprintf("sensitivity groups should be in {%s}",
                 toString(permissable_groups)),
         call. = FALSE)
  }

  # return result
  x
}



#' Get Infaunal Trophic Index
#'
#' This function gets the sensitivity groups to estimate the infaunal
#' trophic index of Gittenberger et al., (2011)
#'
#' @return a data frame with columns \code{TAXON} containing taxa and
#'     \code{GROUP} containing the ITI-groups of Gittenberger & Van Loon (2013).
#'
#' @references Gittenberger A. and  W. van Loon, 2013.
#'     Sensitivities of marine macrozoobenthos to environmental pressures
#'     in the Netherlands. Nederlandse Faunistische
#'     Mededelingen 41: 79-112.
#'
#' @export
get_iti <- function() {
  readRDS(system.file("extdata", "iti.rds", package = "benthos"))
}



#' Read and Validate Infaunal Trophic Index Files
#'
#' This function reads and checks files containing Infaunal Trophic Index
#' (ITI) data (Gittenberger & Van Loon, 2013)
#'
#' @param filename name of the ITI file (character).
#'
#' @details The function performs the following tasks:
#' \itemize{
#' 		\item checks the existence of \code{filename};
#'  	\item checks availability of required columns (case insensitive),
#'          i.e., TAXON and GROUP;
#'  	\item removes redundant spaces;
#'  	\item removes duplicated records;
#'      \item checks if all ITI classes are I, II, III, or IV
#' }
#' The column 'GROUP' contains the Roman numerals I, II, III, and IV, with
#' the following meaning:
#' \describe{
#'     	\item{  I: }{suspension feeders;}
#'  	\item{ II: }{interface feeders;}
#'  	\item{III: }{surface deposit feeders;}
#'      \item{ IV: }{subsurface deposit feeders.}
#' }
#'
#' @return A data frame with columns \code{TAXON} containing taxa and
#'      \code{GROUP} containing user-defined ITI-groups
#'       (see Gittenberger & Van Loon, 2013).
#'
#' @references Gittenberger A. and  W. van Loon, 2013.
#'      Sensitivities of marine macrozoobenthos to environmental pressures
#'      in the Netherlands. Nederlandse Faunistische
#'      Mededelingen 41: 79-112.
#'
#' @importFrom utils read.csv
#'
#' @export
read_iti <- function(filename) {

  # check if file exists
  if (missing(filename)) {
    stop("Filename is missing", call. = FALSE)
  }

  if (!file.exists(filename)) {
    stop(sprintf("File %s not found", sQuote(filename)), call. = FALSE)
  }

  # read file
  d <- try(read.csv(file = filename, as.is = TRUE, na.strings = c("NA", "")),
           silent = TRUE)
  if (inherits(d, "try-error")) {
    stop(sprintf("Errors occurred while reading %s", sQuote(filename)),
      call. = FALSE
    )
  }

  # validate ITI-file
  validate_iti(d)
}



#' @describeIn read_iti validator for ITI-format
#' @param x table in ITI-format
validate_iti <- function(x) {
  .validate_groups(x, permissable_groups = c("I", "II", "III", "IV"))
}



#' Read and Validate Habitat References Files
#'
#' This function reads and checks files with reference values
#'
#' @param filename name of the habitat reference file (\code{character})
#' @param indicators indicators to be processed (\code{character},
#'      see details)
#'
#' @details The function performs the following tasks:
#' \itemize{
#' 		\item{checks the existence of \code{filename};}
#'  	\item{checks availablitity of required columns (case insensitive);}
#'  	\item{removes redundant spaces}
#'  	\item{removes duplicated records}
#'  }
#'
#' Argument \code{indicators} is a \code{character} vector of additional benthic
#' indicators to be checked for. For example, if \code{indicators = "ITI"}, then
#' the habitat reference file should also contain columns ITIREF and ITIBAD.
#' Implemented indicators are N, LNN, S, D, SN, SNA, H, L, AMBI, ITI, PIE, N2
#' (see package vignette).
#'
#' The format of the habitat reference file is documented in the
#' BEQI2-package vignette.
#'
#' @references Van Loon, W, 2013. Loon2013-BEQI2-Specs-Ecotopes-27nov.doc
#'
#' @importFrom utils read.csv
#'
#' @export
read_ref <- function(filename, indicators = c("S", "H", "AMBI")) {

  # check if 'filename' exists
  if (!file.exists(filename)) {
    stop(sprintf("File %s not found", sQuote(filename)), call. = FALSE)
  }

  # read file
  d <- try(read.csv(file = filename, as.is = TRUE, na.strings = c("NA", "")),
           silent = TRUE)
  if (inherits(d, "try-error")) {
    stop(sprintf("Errors occurred while reading %s", sQuote(filename)),
         call. = FALSE)
  }

  # validate REF-file
  validate_ref(d, indicators = indicators)
}



#' @describeIn read_ref validator for REF-format
#' @param x table in REF-format
#'
#' @importFrom dplyr across mutate
#' @importFrom tidyselect where
validate_ref <- function(x, indicators = c("S", "H", "AMBI")) {

  # convert to tibble
  x <- x |>
    as_tibble()

  # check indicators
  valid_indicators <- c("N", "LNN", "S", "D", "SN", "SNA", "H", "L",
                        "AMBI", "ITI", "PIE", "N2")
  indicators <- toupper(indicators)
  if (length(indicators) == 0L) {
    stop(sprintf("No indicators specified. Select a subset from:\n%s",
                 toString(sQuote(valid_indicators))), call. = FALSE)
  }
  invalid_indicators <- indicators[!(indicators %in% valid_indicators)]
  if (length(invalid_indicators) > 0L) {
    stop(sprintf("Invalid indicators found:\n%s",
                 toString(sQuote(invalid_indicators))),
         call. = FALSE)
  }

  # check column names (case insensitive)
  names(x) <- toupper(names(x))
  required_vars <- c(c("OBJECTID", "RELAREA", "HABITAT"),
    paste0(rep(indicators, each = 2), c("REF", "BAD"))
  )
  missing_vars <- setdiff(required_vars, names(x))
  if (length(missing_vars) > 0L) {
    stop(sprintf("The following columns are missing: %s",
                 toString(sQuote(missing_vars))),
         call. = FALSE)
  }

  # keep only required variables
  x <- x[, required_vars]

  # remove redundant spaces
  x <- x |>
    mutate(across(where(is.character), strip_spaces))

  # remove duplicated records
  n <- nrow(x)
  x <- x |>
    distinct()
  n <- n - nrow(x)
  if (n != 0L) {
    message(sprintf("Number of duplicated records: %i", n))
    message("These will be removed")
  }

  # harmonize columns to make case-insensitive matching possible
  x <- x |>
    mutate(across(where(is.character), harmonize))

  # return result
  x
}
