commintng missing things form previous commit

melmasri · Nov 26, 2020 · fabd4f3 · fabd4f3
1 parent b40403e
commit fabd4f3
Show file tree

Hide file tree

Showing 11 changed files with 144 additions and 106 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,9 +1,11 @@
 # Generated by roxygen2: do not edit by hand
 
 S3method(predict,traveltimeCLT)
+S3method(predict,traveltimeCLT.population)
 S3method(predict,traveltimeCLT.trip_specific)
 export(from_to_format)
 export(link_mean_variance)
+export(population_estimates)
 export(residual_autocorrelation)
 export(residual_variance)
 export(rules2timebins)

diff --git a/R/from_to_format.R b/R/from_to_format.R
@@ -2,11 +2,11 @@
 #'
 #' \code{from_to_format} adjust the data to from linkID to linkID format.
 #' 
-#' @param data A data frame of trips and their road level travel information, formated as \code{trips}, see \code{trips} or \code{View(data(trips))}.
+#' @param data A data frame of trips and their road level travel information, formatted as \code{trips}, see \code{trips} or \code{data(trips); View(trips)}.
 #'
 #' @details NULL
 #'
-#' @return \code{from_to_format} returns the data frame with extra columns (linkID.from, linkID.to), and \code{N} representing the number of (i = linkID.from, j = linkID.to, k = timeBin) is present in the dataset.
+#' @return Returns a data frame with extra columns (linkID.from, linkID.to), and \code{N} representing the number of (i = linkID.from, j = linkID.to, k = timeBin) is present in the dataset.
 #' 
 #' @examples
 #' \dontrun{
@@ -21,5 +21,5 @@ from_to_format <- function(data){
     dt[, linkID.from := linkID, by = tripID]
     dt[, linkID.to := shift(linkID, type = 'lead'), by = tripID]
     dt[, N:=.N, by =list(linkID.from, linkID.to, timeBin) ]
-    dt
+    invisible(dt)
 }
diff --git a/R/link_mean_variance.R b/R/link_mean_variance.R
@@ -1,14 +1,20 @@
-#' This function allows to create the graph of the network and run the algorithm on the train set to get the mean of the autocorrelation and the mean of the residuals.
-#' @param data A data frame of trips and their road level travel information, formated as \code{trips}, see \code{trips} or \code{View(data(trips))}.
-#' @param L minimum number of observation to estimate (and not impute) parameters. Default (\code{L=5}).
+#' Estimates mean and variance of each link
+#'
+#' \code{link_mean_variance} estimates the mean and variance of travel duration for each link in the data.
+#'
+#' @param data A data frame of trips and their road level travel information, formatted as \code{trips}, see \code{trips} or \code{data(trips); View(trips)}.
+#' @param L The minimum number of observation needed to estimate (and not impute) parameters. Default (\code{L=5}). see 
 #' @param bins a vector of predefined naming for time bins. Default \code{unique(data$timeBin)}.
+#'
+#' @details For links \code{(linkID.from, linkID.to, timeBin)} that have less than \code{L} number of observations, first they are imputed by \code{(linkID.from, timeBin)} estimates, if there at least \code{L} observations in that category, and second by \code{timeBin} estimates.
 #' 
-#' @details returns a data frame with columns \code{(linkID.from, linkID.to, timeBin, mean, sd, imputed_mean, imputed_sd)} representing the mean and as used for each unit, while \code{imputed_mean} and \code{imputed_sd} indicate whether the calculated quantity is imputed from time bin estimates, or calculated from observed data.
+#' @return Returns a hashed environment with keys as \code{paste0(linkID.from,'.', linkID.to,'.', timeBin)} and values containing \code{list(mean, sd, imputed_mean, imputed_sd)} as the mean and standard deviation of travel time for each unit, while \code{imputed_mean} and \code{imputed_sd} indicate whether the calculated quantities are imputed or calculated from observed data.
+#'
 #' @examples
 #'#' @examples
 #' \dontrun{
 #' data(trips)
-#' estimate_edge_parameters(trips)
+#' link_mean_variance(trips)
 #' }
 #' @import data.table
 #' @export

diff --git a/R/predict.R b/R/predict.R
@@ -1,39 +1,40 @@
-#' This function allows to predict the travel time estimation on the test set.
+#' Predict travel time
 #'
-#' @param object an output of \code{traveltimeCLT}, of class \code{traveltimeCLT}.
-#' @param newdata a data frame of new trips.
-#' @param pred.intervals Type of prediction itervals, \code{trip-specific}, \code{population}, or \code{none}.
-#' @param level Tolerance levels.
+#' \code{predict.traveltimeCLT} predicts expected travel time (ETA) and a prediction interval based on the \code{trip-specific} and \code{population} methods.
+#'
+#' @param object An output of \code{traveltimeCLT}, of class \code{traveltimeCLT}.
+#' @param newdata  A data frame of new trips and their road level travel information, formatted as \code{trips}, see \code{trips} or \code{data(trips); View(trips)}.
+#' @param level Significance levels.
+#'
+#' @details Both the \code{trip-specific} and \code{population} prediction intervals are Gaussian-based. 
+#' 
+#' @return Returns a data fram that inlcudes the ETA (and optionally the trips variance), with lower and upper prediction intervals (optional), for each trip in the \code{newdata}.
 #' @examples
 #' \dontrun{
-#'
 #'}
 #' @import data.table
 #' @export
-predict.traveltimeCLT <- function(object, newdata,
-                                  pred.interval = c('trip-specific', 'population', 'none'),
-                                  level = 0.95){
+predict.traveltimeCLT <- function(object, newdata, level = 0.95){
 
-    interval <- tryCatch(match.arg(pred.interval),error=function(cond){
-        stop("Parameter 'pred.interval' should be eiter 'trip-specific', 'population', or 'none'")
-    })
-
     if(!'data.table' %in% class(newdata))  newdata = data.table(newdata)[order(tripID, entry_time)]
 
-    pred = newdata[, predict.traveltimeCLT.trip_specific(entry_time[1], linkID, distance_meters,
-                                                         object$rho$average_correlation,
-                                                         object$network_parameters,
-                                                         finaly.only = TRUE,
-                                                         pred.type = object$estimate
-                                                         ),by = tripID]
-
-    q = qnorm(level)
-    if(grepl('both', object$estimate))
-        if(grepl('trip-specific', interval)){
+    if(grepl('trip-specific', object$model)){
+        pred = newdata[, predict.traveltimeCLT.trip_specific(entry_time[1], linkID, distance_meters,
+                                                             object$network_parameters, object$rho$average_correlation,
+                                                             finaly.only = TRUE,
+                                                             pred.type = object$estimate
+                                                             ),by = tripID]
+        q = qnorm(level)
+        if(grepl('both', object$estimate)){            
             v = sqrt(object$residual_variance)
             pred[, lwr := ETA - q * v * sqrt(variance)]
             pred[, upr := ETA + q * v * sqrt(variance)]
         }
+    }
+    if(grepl('population', object$model))
+        pred = newdata[, predict.traveltimeCLT.population(.N, object, level = level),
+                       by = tripID]
+
     return(data.frame(pred))
 }
 
diff --git a/R/predict.traveltimeCLT.trip_specific.R b/R/predict.traveltimeCLT.trip_specific.R
@@ -1,24 +1,29 @@
+#' Predict trip-specific travel time
+#'
 #' \code{predict.traveltimeCLT.trip_specific} returns the predicted mean and variance of travel time for a specific route and start time.
 #'
-#' @param starttime a \code{POSIXlt} value representing the start time of the trip.
-#' @param route a vector of links in the order to be traveled.
-#' @param distance a vector of distances to be traveled on each on the links in \code{route}.
-#' @param rho a vector of auto-correlation in the order of lag, starting at lag 0.
-#' @param network_parameters an output of \code{link_mean_variance}, see \code{?link_mean_variance}.
-#' @param final.only a logical indicating whether to return a sequence of means and standard deviation, in the order of \code{route}, or only the final ETA and standard deviation values. Default \code{final.only=TRUE}.
-#' @param timebin_rules a \code{list} of time bin rules to be passed to \code{rules2timebins}, see \code{?rules2timebins}. 
+#' @param starttime A \code{POSIXlt} value representing the start time of the trip.
+#' @param route Vector of links in the order of  travel.
+#' @param distance Vector of distances to be traveled on each on the links in \code{route}.
+#' @param network_parameters An output of \code{link_mean_variance}, see \code{?link_mean_variance}.
+#' @param rho Victor of auto-correlation in the order of lag, starting at lag 0 for no correlation.
+#' @param final.only Logical indicating whether to return a sequence of means and standard deviations, in the order of \code{route}, or only the final ETA and standard deviation. Default \code{final.only=TRUE}.
+#' @param timebin_rules A \code{list} of time bin rules to be passed to \code{rules2timebins}, see \code{?rules2timebins}.
+#' @param pred.type \code{'both'} to predict the mean and variance of travel time and \code{'mean-only'} for the mean. Default \code{'both'}.
 #'
 #' @details NULL
 #'
+#' @return Returns a list of predictions.
+#' 
 #' @examples
 #' \dontrun{
 #'
 #' }
 #' @import data.table
 #' @export
 predict.traveltimeCLT.trip_specific <- function(starttime, route, distance,
-                                                rho=1,
                                                 network_parameters,
+                                                rho=1,
                                                 finaly.only = TRUE,
                                                 timebin_rules = NULL,
                                                 pred.type = c('both', 'mean-only')){
@@ -41,9 +46,6 @@ predict.traveltimeCLT.trip_specific <- function(starttime, route, distance,
     if(length(route) != length(distance))
         stop('length(route) != length(distance)!')
 
-    if(length(rho) > length(route) & grepl('both', pred.type))
-        stop('autocorrelations cannot be larger than length(route) - 1!')
-
     if(!is.null(timebin_rules)){
         timebin_rules = list(
             list(start='6:30', end= '9:00', days = 1:5, tag='MorningRush'),

diff --git a/R/residual_autocorrelation.R b/R/residual_autocorrelation.R
@@ -1,14 +1,15 @@
-#' Calculate mean of auto-correlation
-#' The algorithm only keeps trips ("trips") that have more than 10 ijk links.
-#' On these links, we use a function called "get rho" allowing to calculate the mean of
-#' the auto-correlation of the speed with a lag up to 5 for each trip.
-#' If desired, the function also allows calculation specifically for AM or PM timebins.
+#' Auto-correlation of sequence travel residual
 #'
-#' @param data Train dataset sampled according to time-bin.
-#' @param network_parameters Trips that have more than 10 edges inside the train dataset sampled.
-#' @param lag maximum lag at which to calculate the acf.  Default is 1.
-#' @param nsamples number of random trip to sample for parameter estimation. Default is 500
+#' \code{residual_autocorrelation} Calculates the average autocorrelations of the residual sequence of travel time.
+#'
+#' @param data A data frame of trips and their road level travel information, formatted as \code{trips}, see \code{trips} or \code{data(trips); View(trips)}.
+#' @param network_parameters An output of \code{link_mean_variance}, see \code{?link_mean_variance}.
+#' @param lag Maximum lag at which to calculate the autocorrelations. Default is 1 for the first order-autocorrelations.
+#' @param nsamples The number of trips to sample for parameter estimation. Default is 500.
+#'
+#' @details A residual sequence is a sequences of \code{(duration_sec_i - E[duration_sec_i])/sd(duration_sec_i)} for the \code{i} links of a trip. 
 #' 
+#' @return Returns a data frame of lag order and autocorrelations estimates.
 #' @examples
 #' \dontrun{
 #'

diff --git a/R/residual_variance.R b/R/residual_variance.R
@@ -1,14 +1,17 @@
-#' Calculate mean of auto-correlation
-#' The algorithm only keeps trips ("trips") that have more than 10 ijk links.
-#' On these links, we use a function called "get rho" allowing to calculate the mean of
-#' the auto-correlation of the speed with a lag up to 5 for each trip.
-#' If desired, the function also allows calculation specifically for AM or PM timebins.
+#' Estimates the sample variance of the standardized travel residual
 #'
-#' @param data Train dataset sampled according to time-bin.
-#' @param network_parameters Trips that have more than 10 edges inside the train dataset sampled.
-#' @param lag maximum lag at which to calculate the acf.  Default is 1.
-#' @param nsamples number of random trip to sample for parameter estimation. Default is 500.
+#' \code{residual_variance} estimate the sample variance of standardized travel residual for a set of trips.
 #'
+#' @param data A data frame of trips and their road level travel information, formatted as \code{trips}, see \code{trips} or \code{data(trips); View(trips)}.
+#' @param network_parameters An output of \code{link_mean_variance}, see \code{?link_mean_variance}.
+#' @param lag Maximum lag at which to calculate the autocorrelations. Default is 1 for the first order-autocorrelations.
+#' @param nsamples The number of trips to sample for parameter estimation. Default is 500.
+#' @param ... Extra parameters to be passed to \code{predict.traveltimeCLT.trip_specific}.
+#'
+#' @details The function predicts 'trip-specific' mean and variance of travel time of a sample of trips, given a set of parameter estimates. With such prediction, it estimates the standardized residual and calculates its sample variance. The trip-specific method is a Gaussian-based model, therefore the estimated residual, theoretically, should be 1. Hence, a residual variance of 1.5 resembles \code{sqrt(1.5)-1 = 0.22} of unexplained variability of the model.
+#'
+#' @return Returns the sample variance of the standardized residual.
+#' 
 #' @examples
 #' \dontrun{
 #'
@@ -25,8 +28,9 @@ residual_variance <-function(data, network_parameters, rho = 1L, nsamples=500L,
 
     ## not sure if we need to demain
     V = dt[,predict.traveltimeCLT.trip_specific(entry_time[1], linkID,
-                                                distance_meters, rho = rho,
-                                                network_parameters, ...), by = tripID]
+                                                distance_meters, network_parameters,
+                                                rho = rho,
+                                                ...), by = tripID]
 
     D = merge(dt[,.(dur = sum(duration_secs)),tripID], V)
 

diff --git a/R/sample_trips.R b/R/sample_trips.R
@@ -1,21 +1,21 @@
-#' Sampling a random set of trips from a data set
+#' Sampling a random set of trips from the data
 #'
 #' \code{sample_trips} samples a random set of trips from \code{data}.
 #'
-#' @param data A data frame of trips and their road level travel information, formated as \code{trips}, see \code{trips} or \code{View(data(trips))}.
-#' @param nsamples as scalar representing the number of trips to sample. Default is 100.
-#' @param min.links as minimum number of links in each sampled trip.
+#' @param data A data frame of trips and their road level travel information, formatted as \code{trips}, see \code{trips} or \code{data(trips); View(trips)}.
+#' @param nsamples The number of trips to sample. Default is 100.
+#' @param min.links The minimum number of links in each of the sampled trip.
 #'
 #' @details NULL
 #'
-#' @return \code{sample_trips} return a vector of \code{tripID}'s of the samples trips.
+#' @return Returns a vector of \code{tripID}'s of the sampled trips. Those trips are to removed from the data, it is up to the user to do so.
 #' 
 #' @examples
 #' \dontrun{
 #' data(trips)
-#' # sampling a 100 random trips with minium of 10 links each.
+#' # sampling a 100 random trips with minimum of 10 links each.
 #' index = sample_trips(trips, min.links = 10)
-#' trips[trips$tripID %in% index, ]
+#' trips[trips$tripID %in% index, ]    # sub-setting based on sampled trips
 #' }
 #' @export
 sample_trips <- function(data, nsamples = 100L, min.links= NULL){