#' Pool qualified clients from results of multiple definitions
#'
#' @md
#' @description
#' This function filters and pools, i.e., row bind, qualified clients/groups from different source with an option to summarize by client. Unlike [bind_source()], no need to supply variable names; the function will guess what should be included and their names from the supplied definition from [build_def()]. Whether a client is qualified relies on the flag variables set by [define_case()]. Therefore, this function is intended to be use only with the built-in [define_case()] as `def_fn` in [build_def()].
#'
#' @param data A list of data.frame or remote table which should be output from [execute_def()].
#' @param def A tibble of case definition generated by [build_def()].
#' @param output_lvl Either:
#' * "raw" - output all records (default),
#' * or "clnt" - output one record per client with summaries including date and source of the first valid record ('first_valid_date/src'), and the latest record ('last_entry_date/src'). Source-specific record counts are also provided (see the return section).
#' @param include_src Character. It determines records from which sources should be included. This matters when clients were identified only from, not all, but some of the sources. This choice will not impact the number of client that would be identified but has impact on the number of records and the latest entry date. The options are one of:
#' * "all" - records from all sources are included;
#' * "has_valid" - for each client, records from sources that contain at least one valid record are included;
#' * "n_per_clnt" - for each client, if they had fewer than `n_per_clnt` records in a source (see [restrict_n()]), then records from that source are removed.
#' @param ... Additional arguments passing to [bind_source()]
#'
#' @return A data.frame or remote table with clients that satisfied the predefined case definition. Columns started with "raw_in_" are source-specific counts of raw records, and columns started with "valid_in_" are the number of valid entries (or the number of flags) in each source.
#' @export
#'
#' @examples
#' # toy data
#' df1 <- make_test_dat()
#' df2 <- make_test_dat()
#'
#' # use build_def to make a toy definition
#' sud_def <- build_def("SUD", # usually a disease name
#'   src_lab = c("src1", "src2"), # identify from multiple sources, e.g., hospitalization, ED visits.
#'   # functions that filter the data with some criteria
#'   def_fn = define_case,
#'   fn_args = list(
#'     vars = starts_with("diagx"),
#'     match = "start", # "start" will be applied to all sources as length = 1
#'     vals = list(c("304"), c("305")),
#'     clnt_id = "clnt_id", # list()/c() could be omitted for single element
#'     # c() can be used in place of list
#'     # if this argument only takes one value for each source
#'     n_per_clnt = c(2, 3)
#'   )
#' )
#'
#' # save the definition for re-use
#' # saveRDS(sud_def, file = some_path)
#'
#' # execute definition
#' sud_by_src <- sud_def %>% execute_def(with_data = list(src1 = df1, src2 = df2))
#'
#' # pool results from src1 and src2 together at client level
#' pool_case(sud_by_src, sud_def, output_lvl = "clnt")
pool_case <- function(data, def, output_lvl = c("raw", "clnt"), include_src = c("all", "has_valid", "n_per_clnt"), ...) {
  . <- clnt_id <- date_var <- flag_restrict_date <- flag_restrict_n <- flag_valid_record <- src <- max_date <- first_valid_src <- last_entry_src <- NULL

  output_lvl <- rlang::arg_match0(output_lvl, c("raw", "clnt"))
  include_src <- rlang::arg_match0(include_src, c("all", "has_valid", "n_per_clnt"), )

  # get variable names to keep from def
  dot <- def_to_dot(def)

  # bind data with obtained names
  bind_data <- bind_source(data, !!!dot, ...)
  is_df <- is.data.frame(bind_data)

  # interpret has date_var or not for later whether dates need to be summarized
  has_date_var <- ifelse("date_var" %in% names(dot), "y", "n")
  partial_date_var <- any(is.na(dot[["date_var"]]))

  # treat missing date_var for window order
  # this has to be up front before the first aggregate function
  order_df <- c("def", "clnt_id", "date_var", "src", "uid")
  order_df <- order_df[order_df %in% names(dot)]
  win_order <- c("date_var", "src", "uid")
  if (partial_date_var) {
    order_df <- order_df[order_df != "date_var"]
  }

  # flag calculation
  # if not flag, assume 1
  has_n_flag <- "flag_restrict_n" %in% names(dot)
  has_date_flag <- "flag_restrict_date" %in% names(dot)

  if (has_date_flag) {
    bind_data <- bind_data %>%
      dplyr::mutate(flag_valid_record = flag_restrict_date)
    fill_src <- def$src_labs[which(is.na(dot[["flag_restrict_date"]]))]
  } else if (has_n_flag) {
    bind_data <- bind_data %>%
      dplyr::mutate(flag_valid_record = flag_restrict_n)
    fill_src <- def$src_labs[which(is.na(dot[["flag_restrict_n"]]))]
  }

  # browser()

  if (has_date_flag | has_n_flag) {
    # there might be cases no need to fill
    if (length(fill_src > 0)) {
      bind_data <- bind_data %>%
        dplyr::mutate(flag_valid_record = dplyr::case_when(src %in% local(fill_src) ~ 1L,
          .default = flag_valid_record
        ))
    }
  } else {
    bind_data <- bind_data %>%
      dplyr::mutate(flag_valid_record = 1L)
  }
  # browser()

  switch(include_src,
    all = {
      bind_data <- bind_data %>%
        dplyr::group_by(def, clnt_id) %>%
        flex_order(is_df = is_df, win_order = win_order) %>%
        dplyr::filter(max(flag_valid_record, na.rm = TRUE) > 0L)
    },
    has_valid = {
      bind_data <- bind_data %>%
        dplyr::group_by(def, clnt_id, src) %>%
        flex_order(is_df = is_df, win_order = c("date_var", "uid")) %>%
        dplyr::filter(max(flag_valid_record, na.rm = TRUE) > 0L) %>%
        dplyr::group_by(def, clnt_id) %>%
        flex_order(is_df = is_df, win_order = win_order)
    },
    n_per_clnt = {
      if (has_n_flag) {
        # there might be cases no need to fill
        fill_n <- def$src_labs[which(is.na(dot[["flag_restrict_n"]]))]
        if (length(fill_n > 0)) {
          bind_data <- bind_data %>%
            dplyr::mutate(flag_restrict_n = dplyr::case_when(src %in% local(fill_n) ~ 1L,
              .default = flag_restrict_n
            ))
        }
        bind_data <- bind_data %>%
          dplyr::filter(flag_restrict_n > 0L) %>%
          dplyr::group_by(def, clnt_id) %>%
          flex_order(is_df = is_df, win_order = win_order)
      } else {
        stop("Input data does not contain flag_restrict_n")
      }
    }
  )

  # browser()

  # job done if output raw
  # lines after this return is for sum by clients
  if (output_lvl == "raw") {
    if (partial_date_var) {
      warning("'date_var' is missing in some of the sources. Records cannot be sorted by dates.")
    }
    if (is_df) {
      bind_data <- dplyr::ungroup(bind_data) %>%
        dplyr::arrange(dplyr::pick(dplyr::any_of(order_df)))
    } else {
      bind_data <- clean_db(bind_data)
    }
    return(bind_data)
  }

  if (has_date_var == "y") {
    if (partial_date_var) {
      has_date_var <- "n"
      warning("'date_var' is missing in some of the sources. First/last date of valid records cannot be determined at client level")
    }
  }

  # start with getting the common steps for both data.frame and database
  # including get the last date
  # then filter valid entries only
  # branching by data type to sum first valid date and valid sources

  if (has_date_var == "y") {
    bind_data <- bind_data %>%
      dplyr::mutate(
        max_date = max(date_var, na.rm = TRUE),
        last_entry_src = dplyr::last(src)
      )
  }

  # getting source indicators
  src_nm <- unique(def[["src_labs"]])
  src_formula <- purrr::map(src_nm, function(x) rlang::new_formula(NULL, rlang::expr(. == !!x)))
  names(src_formula) <- glue::glue("in_{src_nm}")
  bind_data <- bind_data %>%
    dplyr::mutate(dplyr::across(src, list(!!!src_formula), .names = "{.fn}"))

  # preserve raw record count before filter out valid records
  bind_data <- bind_data %>%
    dplyr::mutate(dplyr::across(dplyr::starts_with("in_"), ~ sum(as.integer(.), na.rm = TRUE), .names = "raw_{.col}"))

  bind_data <- bind_data %>%
    dplyr::filter(flag_valid_record == 1)

  switch(has_date_var,
    y = {
      bind_data <- bind_data %>%
        dplyr::mutate(first_valid_src = dplyr::first(src)) %>%
        dplyr::summarise(
          first_valid_date = min(date_var, na.rm = TRUE),
          first_valid_src = min(first_valid_src, na.rm = TRUE),
          last_entry_date = max(max_date, na.rm = TRUE),
          last_entry_src = min(last_entry_src, na.rm = TRUE),
          dplyr::across(dplyr::starts_with("raw_"), ~ mean(., na.rm = TRUE)),
          dplyr::across(dplyr::starts_with("in_"), ~ sum(as.integer(.), na.rm = TRUE), .names = "valid_{.col}")
        )
    },
    n = {
      bind_data <- bind_data %>%
        dplyr::summarise(
          dplyr::across(dplyr::starts_with("raw_"), ~ mean(., na.rm = TRUE)),
          dplyr::across(dplyr::starts_with("in_"), ~ sum(as.integer(.), na.rm = TRUE), .names = "valid_{.col}")
        )
    }
  )

  if (is_df) {
    bind_data <- dplyr::ungroup(bind_data) %>%
      dplyr::arrange(dplyr::pick(dplyr::any_of(order_df)))
  } else {
    bind_data <- clean_db(bind_data)
  }

  return(bind_data)
}

flex_order <- function(data, is_df, win_order) {
  if (!is_df) {
    data <- data %>%
      dbplyr::window_order(dplyr::pick(dplyr::any_of(win_order)))
  }
  return(data)
}
