###############################################################################
# Function_TemporalAggregating.R: Develop functions for aggregating values

# Written by Wout Van Echelpoel (UGent)
# Wout.VanEchelpoel@UGent.be - Jan 2022
###############################################################################

###############################################################################
# 0 - Libraries
###############################################################################
# No libraries identified
###############################################################################

###############################################################################
# 1 - Static part
###############################################################################
# No 'static' elements
###############################################################################

###############################################################################
# 2 - Script
###############################################################################
# Define helper function to calculate statistic
f.calculateStatistic <- function(data, stat = 'mean'){
  # Determine if requested statistic is included
  if(!tolower(stat) %in% c('mean', 'sd', 'se', 'median', 'min', 'max', '5perc', 
                           '10perc', '90perc', '99perc','sum')){
    stop(paste0('Options for statistic are: (1) mean, (2) sd, (3) se, ', 
                '(4) median, (5) min, (6) max, (7) 5perc, (8) 10perc, ', 
                '(9) 90perc, (10) 99perc and (11) sum'), call. = F)
  }
  
  # Determine requested statistic
  if(tolower(stat) == 'mean'){
    n.stat <- mean(data$value, na.rm = T)
  } else if(tolower(stat) == 'sd'){
    n.stat <- sd(data$value, na.rm = T)
  } else if(tolower(stat) == 'se'){
    n.stat <- sd(data$value, na.rm = T) / sqrt(nrow(data[!is.na(data$value)]))
  } else if(tolower(stat) == 'median'){
    n.stat <- median(data$value, na.rm = T)
  } else if(tolower(stat) == 'min'){
    n.stat <- min(data$value, na.rm = T)
  } else if(tolower(stat) == 'max'){
    n.stat <- max(data$value, na.rm = T)
  } else if(tolower(stat) == '5perc'){
    n.stat <- quantile(data$value, probs = 0.05, na.rm = T)
  } else if(tolower(stat) == '10perc'){
    n.stat <- quantile(data$value, probs = 0.10, na.rm = T)
  } else if(tolower(stat) == '90perc'){
    n.stat <- quantile(data$value, probs = 0.90, na.rm = T)
  } else if(tolower(stat) == '99perc'){
    n.stat <- quantile(data$value, probs = 0.99, na.rm = T)
  } else if(tolower(stat) == 'sum'){
    n.stat <- sum(data$value, na.rm = T)
  }
  # Return calculated statistic
  return(n.stat)
}

# Define function for monthly aggregation per station
f.dailyPerStation <- function(data, stat = 'mean'){
  # Determine if all required columns are present
  if(length(which(c('datetime', 'stationname', 'value') %in% names(data))) != 3){
    stop('Required input: (1) datetime, (2) stationname and (3) value')
  } else {
    message(paste0('Calculate ', stat, ' per day and per station'))
  }
  
  # Remove stations without stationname
  message(paste0("---Removed ", sum(is.na(data$stationname)),
                 " observations without stationname---"))
  data <- data[!is.na(data$stationname), ]
  
  # Simplify datetime to date
  data$date <- as.Date(format(data$datetime, '%Y-%m-%d'))
  
  # Split data conditional to zone and date
  lst.dat <- split(data, list(data$stationname, data$date), drop = T)
  
  # Define output list
  lst.day <- list()
  
  # For each subset, calculate statistic if more than 1 observation
  for (i in c(1:length(lst.dat))){
    # Add general information to output
    v.names <- c('latitude', 'longitude', 'stationname', 'segment', 'zone', 
                 'region', 'datetime', 'year', 'month', 'parametername', 
                 'unit', 'value', 'valuesign', 'dataprovider')
    lst.day[[i]] <- lst.dat[[i]][1, names(lst.dat[[i]]) %in% v.names]
    
    # If more than one observation, adapt information
    if(nrow(lst.dat[[i]]) > 1){
      # Determine requested statistic and add to output
      n.stat <- f.calculateStatistic(lst.dat[[i]], stat = stat)
      lst.day[[i]]$value <- n.stat
      
      # Determine (new) datetime and add to output
      if(stat %in% c('min', 'max')){
        lst.day[[i]]$datetime <- 
          max(lst.dat[[i]]$datetime[which(lst.dat[[i]]$value == n.stat)])
      } else {
        if(i == 1){ message('---Using mean of datetimes for new value---')}
        lst.day[[i]]$datetime <- mean(lst.dat[[i]]$datetime)
      }
    }
  }
  
  # Merge individual statistics and return sorted overview
  df.day <- do.call('rbind', lst.day)
  message(paste0('---Reduced data with ', nrow(data) - nrow(df.day), 
                 ' observations through merging---'))
  message(paste0('---Daily ', stat, ' per station calculated---'))
  return(df.day[order(df.day$stationname, df.day$datetime), ])
}

# Define function for monthly aggregation per station
f.monthlyPerStation <- function(data, stat = 'mean'){
  # Determine if all required columns are present
  if(length(which(c('year', 'month', 'stationname', 'value') %in% names(data))) != 4){
    stop('Required input: (1) year, (2) month, (3) stationname and (4) value')
  } else {
    message(paste0('Calculate ', stat, ' per month and per station'))
  }
  
  # Remove stations without stationname
  message(paste0("---Removed ", sum(is.na(data$stationname)),
                 " observations without stationname---"))
  data <- data[!is.na(data$stationname), ]
  
  # Split data conditional to zone, year and month
  lst.dat <- split(data, list(data$stationname, data$year, data$month), drop = T)

  # Define output list
  lst.mnth <- list()
  
  # For each subset, calculate statistic
  for (i in c(1:length(lst.dat))){
    # Add general information to output
    v.names <- c('latitude', 'longitude', 'stationname', 'parametername',
                 'unit', 'segment', 'zone', 'year', 'month')
    lst.mnth[[i]] <- lst.dat[[i]][1, names(lst.dat[[i]]) %in% v.names]
    
    # Determine requested statistic and add to output
    lst.mnth[[i]]$value <- f.calculateStatistic(lst.dat[[i]], stat = stat)
  }
  
  # Merge individual statistics and return sorted overview
  df.mnth <- do.call('rbind', lst.mnth)
  message(paste0('---Monthly ', stat, ' per station calculated---'))
  return(df.mnth[order(df.mnth$stationname, df.mnth$year, df.mnth$month), ])
}

# Define function for monthly aggregation per segment (level 4)
f.monthlyPerSegment <- function(data, stat = 'mean'){
  # Determine if all required columns are present
  if(length(which(c('year', 'month', 'segment', 'value') %in% names(data))) != 4){
    stop('Required input: (1) year, (2) month, (3) segment and (4) value')
  } else {
    message(paste0('Calculate ', stat, ' per month and per segment'))
  }
  
  # Remove stations without segment
  message(paste0("---Removed ", sum(is.na(data$segment)),
                 " observations without segment---"))
  data <- data[!is.na(data$segment), ]

  # Split data conditional to zone, year and month
  lst.dat <- split(data, list(data$segment, data$year, data$month), drop = T)

  # Define output list
  lst.mnth <- list()
  
  # For each subset, calculate statistic
  for (i in c(1:length(lst.dat))){
    # Add general information to output
    v.names <- c('segment', 'zone', 'year', 'month', 'parametername', 'unit')
    lst.mnth[[i]] <- lst.dat[[i]][1, names(lst.dat[[i]]) %in% v.names]
    
    # Determine requested statistic and add to output
    lst.mnth[[i]]$value <- f.calculateStatistic(lst.dat[[i]], stat = stat)
  }
  
  # Merge individual statistics and return sorted overview
  df.mnth <- do.call('rbind', lst.mnth)
  message(paste0('---Monthly ', stat, ' per segment calculated---'))
  return(df.mnth[order(df.mnth$segment, df.mnth$year, df.mnth$month), ])
}

# Define function for monthly aggregation per zone (level 3)
f.monthlyPerZone <- function(data, stat = 'mean'){
  # Determine if all required columns are present
  if(length(which(c('year', 'month', 'zone', 'value') %in% names(data))) != 4){
    stop('Required input: (1) year, (2) month, (3) zone and (4) value')
  } else {
    message(paste0('Calculate ', stat, ' per month and per salinity zone'))
  }
  
  # Remove stations without zone
  message(paste0("---Removed ", sum(is.na(data$zone)),
                " observations without zone---"))
  data <- data[!is.na(data$zone), ]
  
  # Split data conditional to zone, year and month
  lst.dat <- split(data, list(data$zone, data$year, data$month), drop = T)
  
  # Define output list
  lst.mnth <- list()
  
  # For each subset, calculate statistic
  for (i in c(1:length(lst.dat))){
    # Add general information to output
    v.names <- c('zone', 'year', 'month', 'parametername', 'unit')
    lst.mnth[[i]] <- lst.dat[[i]][1, names(lst.dat[[i]]) %in% v.names]
    
    # Determine requested statistic and add to output
    lst.mnth[[i]]$value <- f.calculateStatistic(lst.dat[[i]], stat = stat)
  }
  
  # Merge individual statistics and return sorted overview
  df.mnth <- do.call('rbind', lst.mnth)
  message(paste0('---Monthly ', stat, ' per salinity zone calculated---'))
  return(df.mnth[order(df.mnth$zone, df.mnth$year, df.mnth$month), ])
}

# Define function for yearly aggregation per station
f.yearlyPerStation <- function(data, stat = 'mean'){
  # Determine if all required columns are present
  if(length(which(c('year', 'stationname', 'value') %in% names(data))) != 3){
    stop('Required input: (1) year, (2) stationname and (3) value')
  } else {
    message(paste0('Calculate ', stat, ' per year and per station'))
  }
  
  # Remove stations without stationname
  message(paste0("---Removed ", sum(is.na(data$stationname)), 
                 " observations without stationname---"))
  data <- data[!is.na(data$stationname), ]

  # Split data conditional to station and year
  lst.dat <- split(data, list(data$stationname, data$year), drop = T)
  
  # Define output list
  lst.yr <- list()
  
  # For each subset, calculate statistic
  for (i in c(1:length(lst.dat))){
    # Add general information to output
    v.names <- c('latitude', 'longitude', 'stationname', 'parametername',
                 'unit', 'segment', 'zone', 'year')
    lst.yr[[i]] <- lst.dat[[i]][1, names(lst.dat[[i]]) %in% v.names]
    
    # Determine requested statistic and add to output
    lst.yr[[i]]$value <- f.calculateStatistic(lst.dat[[i]], stat = stat)
  }
  
  # Merge individual statistics and return sorted overview
  df.yr <- do.call('rbind', lst.yr)
  message(paste0('---Yearly ', stat, ' per station calculated---'))
  return(df.yr[order(df.yr$stationname, df.yr$year), ])
}

# Define function for yearly aggregation per segment (level 4)
f.yearlyPerSegment <- function(data, stat = 'mean'){
  # Determine if all required columns are present
  if(length(which(c('year', 'segment', 'value') %in% names(data))) != 3){
    stop('Required input: (1) year, (2) segment and (3) value')
  } else {
    message(paste0('Calculate ', stat, ' per year and per segment'))
  }
  
  # Remove stations without segment
  message(paste0("---Removed ", sum(is.na(data$segment)), 
                 " observations without segment---"))
  data <- data[!is.na(data$segment), ]
  
  # Split data conditional to segment and year
  lst.dat <- split(data, list(data$segment, data$year), drop = T)
  
  # Define output list
  lst.yr <- list()
  
  # For each subset, calculate statistic
  for (i in c(1:length(lst.dat))){
    # Add general information to output
    v.names <- c('segment', 'zone', 'year', 'parametername', 'unit')
    lst.yr[[i]] <- lst.dat[[i]][1, names(lst.dat[[i]]) %in% v.names]
    
    # Determine requested statistic and add to output
    lst.yr[[i]]$value <- f.calculateStatistic(lst.dat[[i]], stat = stat)
  }
  
  # Merge individual statistics and return sorted overview
  df.yr <- do.call('rbind', lst.yr)
  message(paste0('---Yearly ', stat, ' per segment calculated---'))
  return(df.yr[order(df.yr$segment, df.yr$year), ])
}

# Define function for yearly aggregation per zone (level 3)
f.yearlyPerZone <- function(data, stat = 'mean'){
  # Determine if all required columns are present
  if(length(which(c('year', 'zone', 'value') %in% names(data))) != 3){
    stop('Required input: (1) year, (2) zone and (3) value')
  } else {
    message(paste0('Calculate ', stat, ' per year and per zone'))
  }

  # Remove stations without zone
  message(paste0("---Removed ", sum(is.na(data$zone)), 
                 " observations without zone---"))
  data <- data[!is.na(data$zone), ]
  
  # Split data conditional to zone and year
  lst.dat <- split(data, list(data$zone, data$year), drop = T)
  
  # Define output list
  lst.yr <- list()
  
  # For each subset, calculate statistic
  for (i in c(1:length(lst.dat))){
    # Add general information to output
    v.names <- c('zone', 'year', 'parametername', 'unit')
    lst.yr[[i]] <- lst.dat[[i]][1, names(lst.dat[[i]]) %in% v.names]
    
    # Determine requested statistic and add to output
    lst.yr[[i]]$value <- f.calculateStatistic(lst.dat[[i]], stat = stat)
  }
  
  # Merge individual statistics and return sorted overview
  df.yr <- do.call('rbind', lst.yr)
  message(paste0("---Yearly ", stat, " per salinity zone calculated---"))
  return(df.yr[order(df.yr$zone, df.yr$year), ])
}

# Define function for year range aggregation per station
f.periodPerStation <- function(data, stat = 'mean', reduce = FALSE){
  # Determine if all required columns are present
  if(length(which(c('year', 'stationname', 'value') %in% names(data))) != 3){
    stop('Required input: (1) year, (2) stationname and (3) value')
  } else {
    message(paste0('Calculate ', stat, ' per period and per station'))
  }
  
  # Remove stations without stationname
  message(paste0("---Removed ", sum(is.na(data$stationname)), 
                 " observations without stationname---"))
  data <- data[!is.na(data$stationname), ]
  
  # Determine reporting period
  n.per <- paste0(min(data$year, na.rm = T), '-', max(data$year, na.rm = T))
  
  # Split data conditional to station
  lst.dat <- split(data, list(data$stationname), drop = T)
  
  # Define output list
  lst.prd <- list()
  
  # For each subset, calculate statistic
  for (i in c(1:length(lst.dat))){
    # Add general information to output
    v.names <- c('latitude', 'longitude', 'stationname', 'parametername',
                 'segment', 'zone')
    lst.prd[[i]] <- lst.dat[[i]][1, names(lst.dat[[i]]) %in% v.names]
    
    # Add additional information (type and data availability period)
    lst.prd[[i]]$type <- 'Station'
    lst.prd[[i]]$data <- n.per
    
    # Determine requested statistic and add unit
    lst.prd[[i]]$value <- f.calculateStatistic(lst.dat[[i]], stat = stat)
    lst.prd[[i]]$unit <- ifelse('unit' %in% names(lst.dat[[i]]), 
                                lst.dat[[i]]$unit[1], NA)
  }
  
  # Merge individual statistics and simplify (if requested)
  df.prd <- do.call('rbind', lst.prd)
  df.prd <- df.prd[order(df.prd$stationname), ]
  if(reduce){
    df.prd <- df.prd[, c('stationname', 'type', 'data', 'value', 'unit')]
    names(df.prd)[1] <- 'location'
  }
  
  # Return sorted overview
  message(paste0('---Periodical ', stat, ' per station calculated---'))
  return(df.prd)
}

# Define function for year range aggregation per segment (level 4)
f.periodPerSegment <- function(data, stat = 'mean', reduce = FALSE){
  # Determine if all required columns are present
  if(length(which(c('year', 'segment', 'value') %in% names(data))) != 3){
    stop('Required input: (1) year, (2) segment and (3) value')
  } else {
    message(paste0('Calculate ', stat, ' per period and per segment'))
  }
  
  # Remove stations without segment
  message(paste0("---Removed ", sum(is.na(data$segment)), 
                 " observations without segment---"))
  data <- data[!is.na(data$segment), ]
  
  # Determine reporting period
  n.per <- paste0(min(data$year, na.rm = T), '-', max(data$year, na.rm = T))
  
  # Split data conditional to segment
  lst.dat <- split(data, list(data$segment), drop = T)
  
  # Define output list
  lst.prd <- list()
  
  # For each subset, calculate statistic
  for (i in c(1:length(lst.dat))){
    # Add general information to output
    v.names <- c('parametername', 'segment', 'zone')
    lst.prd[[i]] <- lst.dat[[i]][1, names(lst.dat[[i]]) %in% v.names]
    
    # Add additional information (type and data availability period)
    lst.prd[[i]]$type <- 'Segment'
    lst.prd[[i]]$data <- n.per
    
    # Determine requested statistic and add unit
    lst.prd[[i]]$value <- f.calculateStatistic(lst.dat[[i]], stat = stat)
    lst.prd[[i]]$unit <- ifelse('unit' %in% names(lst.dat[[i]]), 
                                lst.dat[[i]]$unit[1], NA)
  }
  
  # Merge individual statistics and simplify (if requested)
  df.prd <- do.call('rbind', lst.prd)
  df.prd <- df.prd[order(df.prd$segment), ]
  if(reduce){
    df.prd <- df.prd[, c('segment', 'type', 'data', 'value', 'unit')]
    names(df.prd)[1] <- 'location'
  }
  
  # Return sorted overview
  message(paste0('---Periodical ', stat, ' per segment calculated---'))
  return(df.prd)
}

# Define function for year range aggregation per zone (level 3)
f.periodPerZone <- function(data, stat = 'mean', reduce = FALSE){
  # Determine if all required columns are present
  if(length(which(c('year', 'zone', 'value') %in% names(data))) != 3){
    stop('Required input: (1) year, (2) zone and (3) value')
  } else {
    message(paste0('Calculate ', stat, ' per period and per salinity zone'))
  }
  
  # Remove stations without zone
  message(paste0("---Removed ", sum(is.na(data$zone)), 
                 " observations without zone---"))
  data <- data[!is.na(data$zone), ]
  
  # Determine reporting period
  n.per <- paste0(min(data$year, na.rm = T), '-', max(data$year, na.rm = T))
  
  # Split data conditional to zone
  lst.dat <- split(data, list(data$zone), drop = T)
  
  # Define output list
  lst.prd <- list()
  
  # For each subset, calculate statistic
  for (i in c(1:length(lst.dat))){
    # Add general information to output
    v.names <- c('parametername', 'zone')
    lst.prd[[i]] <- lst.dat[[i]][1, names(lst.dat[[i]]) %in% v.names]
    
    # Add additional information (type and data availability period)
    lst.prd[[i]]$type <- 'Zone'
    lst.prd[[i]]$data <- n.per
    
    # Determine requested statistic and add unit
    lst.prd[[i]]$value <- f.calculateStatistic(lst.dat[[i]], stat = stat)
    lst.prd[[i]]$unit <- ifelse('unit' %in% names(lst.dat[[i]]), 
                                lst.dat[[i]]$unit[1], NA)
  }
  
  # Merge individual statistics and simplify (if requested)
  df.prd <- do.call('rbind', lst.prd)
  df.prd <- df.prd[order(df.prd$zone), ]
  if(reduce){
    df.prd <- df.prd[, c('zone', 'type', 'data', 'value', 'unit')]
    names(df.prd)[1] <- 'location'
  }
  
  # Return sorted overview
  message(paste0('---Periodical ', stat, ' per salinity zone calculated---'))
  return(df.prd)
}
###############################################################################

###############################################################################
# 3 - Remove excess variables
###############################################################################
# No excess variables
###############################################################################