diff --git a/DESCRIPTION b/DESCRIPTION index edcc18c..51e137e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -21,7 +21,9 @@ LinkingTo: Rcpp, RcppArmadillo Imports: - Rcpp + Rcpp, + Rdpack +RdMacros: Rdpack Depends: R (>= 4.1.0) Suggests: diff --git a/R/compute_sequentially.R b/R/compute_sequentially.R index 14123fc..4ae8b92 100644 --- a/R/compute_sequentially.R +++ b/R/compute_sequentially.R @@ -1,35 +1,72 @@ #' Compute the Bayesian Mallows model sequentially #' +#' This function performs sequential Bayesian inference for the Mallows model +#' using the SMC² (Sequential Monte Carlo squared) algorithm. It can handle +#' both complete/partial rankings and pairwise preference data that arrive +#' sequentially over time. #' -#' @param data A dataframe containing partial rankings or pairwise preferences. -#' If `data` contains complete or partial rankings, it must have the following -#' columns: +#' @param data A data frame containing ranking or preference data with temporal +#' structure. The data frame must include `timepoint` and `user` columns. #' -#' \itemize{ -#' \item `timepoint`: a numeric vector denoting the timepoint, starting at 1. -#' \item `user`: a vector identifying the user. -#' \item `item1`: ranking of item 1. -#' \item `item2`: ranking of item 2. -#' \item etc. -#' } +#' For complete or partial rankings, additional columns should be: +#' \itemize{ +#' \item `timepoint`: Numeric vector denoting the timepoint, starting at 1. +#' \item `user`: Vector identifying the user providing the ranking. +#' \item `item1`, `item2`, etc.: Rankings of items (use NA for missing items +#' in partial rankings). +#' } #' -#' If data contains pairwise preferences, it must have the following -#' structure: +#' For pairwise preferences, the structure should be: +#' \itemize{ +#' \item `timepoint`: Numeric vector denoting the timepoint, starting at 1. +#' \item `user`: Vector identifying the user providing the preference. 
+#' \item `top_item`: Identifier for the preferred item. +#' \item `bottom_item`: Identifier for the less preferred item. +#' } #' -#' \itemize{ -#' \item `timepoint`: a numeric vector denoting the timepoint, starting at 1. -#' \item `user`: a vector identifying the user. -#' \item `top_item`: identifier for the preferred item. -#' \item `bottom_item`: identifier for the dispreferred item. -#' } -#' -#' @param hyperparameters A list returned from [set_hyperparameters()]. -#' @param smc_options A list returned from [set_smc_options()] +#' @param hyperparameters A list of hyperparameters returned from +#' \code{\link{set_hyperparameters}}. Defines the prior distributions for +#' model parameters. +#' @param smc_options A list of SMC algorithm options returned from +#' \code{\link{set_smc_options}}. Controls the behavior of the particle +#' filtering algorithm. #' @param topological_sorts A list returned from -#' [precompute_topological_sorts()]. Only used with preference data, and -#' defaults to `NULL`. +#' \code{\link{precompute_topological_sorts}}. Required when using pairwise +#' preference data, otherwise should be \code{NULL} (default). +#' +#' @return An object of class \code{BayesMallowsSMC2} containing the results +#' of the sequential inference, including parameter traces, log marginal +#' likelihood estimates, and other algorithm diagnostics. 
+#' +#' @examples +#' # Example with complete rankings +#' set.seed(123) +#' n_items <- 4 +#' +#' # Create synthetic ranking data +#' ranking_data <- data.frame( +#' timepoint = c(1, 1, 2, 2), +#' user = c(1, 2, 3, 4), +#' item1 = c(1, 2, 1, 3), +#' item2 = c(2, 1, 3, 1), +#' item3 = c(3, 4, 2, 4), +#' item4 = c(4, 3, 4, 2) +#' ) +#' +#' # Set up hyperparameters and options +#' hyper <- set_hyperparameters(n_items = n_items) +#' opts <- set_smc_options(n_particles = 100, verbose = FALSE) +#' +#' # Run sequential inference +#' result <- compute_sequentially( +#' data = ranking_data, +#' hyperparameters = hyper, +#' smc_options = opts +#' ) +#' +#' @references +#' \insertRef{sorensen2025sequential}{BayesMallowsSMC2} #' -#' @return An object of class BayesMallowsSMC2. #' @export #' compute_sequentially <- function( @@ -76,7 +113,7 @@ compute_sequentially <- function( user_timepoint_combinations <- unique(data[c("user", "timepoint")]) if(max(table(user_timepoint_combinations$user)) > 1) { stop("Each user can only enter the pool once. Users appearing at multiple timepoints: ", - paste(names(table(user_timepoint_combinations$user))[table(user_timepoint_combinations$user) > 1], + paste(names(table(user_timepoint_combinations$user))[table(user_timepoint_combinations$user) > 1], collapse = ", ")) } diff --git a/R/set_hyperparameters.R b/R/set_hyperparameters.R index 9425524..ea53422 100644 --- a/R/set_hyperparameters.R +++ b/R/set_hyperparameters.R @@ -1,12 +1,41 @@ -#' Set hyperparameters +#' Set hyperparameters for Bayesian Mallows model #' -#' @param n_items Integer defining the number of items. -#' @param alpha_shape Shape of gamma prior for alpha. -#' @param alpha_rate Rate of gamma prior for alpha. -#' @param cluster_concentration Concentration parameter of Dirichlet distribution for cluster probabilities. -#' @param n_clusters Integer defining the number of clusters. 
+#' This function creates a list of hyperparameters for the Bayesian Mallows model +#' used in sequential Monte Carlo inference. The hyperparameters define the prior +#' distributions for the model parameters. #' -#' @return A list +#' @param n_items Integer defining the number of items to be ranked. Must be a +#' positive integer. +#' @param alpha_shape Positive numeric value specifying the shape parameter of +#' the gamma prior distribution for the scale parameter alpha. Default is 1. +#' @param alpha_rate Positive numeric value specifying the rate parameter of +#' the gamma prior distribution for the scale parameter alpha. Default is 0.5. +#' @param cluster_concentration Positive numeric value specifying the +#' concentration parameter of the Dirichlet distribution for cluster +#' probabilities. Default is 10. +#' @param n_clusters Positive integer defining the number of clusters in the +#' mixture model. Default is 1. +#' +#' @return A list containing the hyperparameter values with elements: +#' \item{n_items}{Number of items} +#' \item{alpha_shape}{Shape parameter for alpha prior} +#' \item{alpha_rate}{Rate parameter for alpha prior} +#' \item{cluster_concentration}{Concentration parameter for cluster probabilities} +#' \item{n_clusters}{Number of clusters} +#' +#' @examples +#' # Basic hyperparameters for 5 items +#' hyper <- set_hyperparameters(n_items = 5) +#' +#' # Custom hyperparameters with multiple clusters +#' hyper <- set_hyperparameters( +#' n_items = 10, +#' alpha_shape = 2, +#' alpha_rate = 1, +#' cluster_concentration = 5, +#' n_clusters = 3 +#' ) +#' #' @export #' set_hyperparameters <- function( diff --git a/R/set_smc_options.R b/R/set_smc_options.R index 425e7d8..01ebcda 100644 --- a/R/set_smc_options.R +++ b/R/set_smc_options.R @@ -1,26 +1,62 @@ -#' Set SMC options +#' Set SMC options for sequential inference #' -#' @param n_particles Number of particles -#' @param n_particle_filters Initial number of particle filters for each -#' particle -#' 
@param max_particle_filters Maximum number of particle filters. -#' @param resampling_threshold Effective sample size threshold for resampling -#' @param doubling_threshold Threshold for particle filter doubling. If the -#' acceptance rate of the rejuvenation step falls below this threshold, the -#' number of particle filters is doubled. Defaults to 0.2. -#' @param max_rejuvenation_steps Maximum number of rejuvenation steps. If the -#' number of unique particles has not exceeded half the number of particles -#' after this many steps, the rejuvenation is still stopped. -#' @param metric Metric -#' @param resampler resampler -#' @param latent_rank_proposal latent rank proposal -#' @param verbose Boolean -#' @param trace Logical specifying whether to save static parameters at each -#' timestep. -#' @param trace_latent Logical specifying whether to sample and save one -#' complete set of latent rankings for each particle and each timepoint. +#' This function creates a list of options for the Sequential Monte Carlo (SMC²) +#' algorithm used in Bayesian inference for the Mallows model. These options +#' control the behavior of the particle filtering and resampling procedures. #' -#' @return A list +#' @param n_particles Positive integer specifying the number of particles to use +#' in the SMC algorithm. Default is 1000. +#' @param n_particle_filters Positive integer specifying the initial number of +#' particle filters for each particle. Default is 50. +#' @param max_particle_filters Positive integer specifying the maximum number +#' of particle filters allowed. Default is 10000. +#' @param resampling_threshold Positive numeric value specifying the effective +#' sample size threshold for triggering resampling. Default is n_particles/2. +#' @param doubling_threshold Numeric value between 0 and 1 specifying the +#' threshold for particle filter doubling. 
If the acceptance rate of the +#' rejuvenation step falls below this threshold, the number of particle +#' filters is doubled. Default is 0.2. +#' @param max_rejuvenation_steps Positive integer specifying the maximum number +#' of rejuvenation steps. If the number of unique particles has not exceeded +#' half the number of particles after this many steps, the rejuvenation is +#' stopped. Default is 20. +#' @param metric Character string specifying the distance metric to use. +#' Options include "footrule", "kendall", "spearman", "cayley", "hamming", +#' and "ulam". Default is "footrule". +#' @param resampler Character string specifying the resampling method. +#' Options include "multinomial", "residual", "stratified", and "systematic". +#' Default is "multinomial". +#' @param latent_rank_proposal Character string specifying the proposal +#' distribution for latent rankings. Default is "uniform". +#' @param verbose Logical value indicating whether to print progress messages +#' during computation. Default is FALSE. +#' @param trace Logical value specifying whether to save static parameters at +#' each timestep. Default is FALSE. +#' @param trace_latent Logical value specifying whether to sample and save one +#' complete set of latent rankings for each particle and each timepoint. +#' Default is FALSE. +#' +#' @return A list containing all the SMC options with the specified values. 
+#' +#' @examples +#' # Default SMC options +#' opts <- set_smc_options() +#' +#' # Custom SMC options with fewer particles and Kendall distance +#' opts <- set_smc_options( +#' n_particles = 500, +#' n_particle_filters = 25, +#' metric = "kendall", +#' verbose = TRUE +#' ) +#' +#' # Options for tracing parameters +#' opts <- set_smc_options( +#' n_particles = 100, +#' trace = TRUE, +#' trace_latent = TRUE +#' ) +#' #' @export #' set_smc_options <- function( diff --git a/README.Rmd b/README.Rmd index b02b155..1171598 100644 --- a/README.Rmd +++ b/README.Rmd @@ -1,5 +1,6 @@ --- output: github_document +bibliography: inst/REFERENCES.bib --- @@ -19,7 +20,9 @@ knitr::opts_chunk$set( [![R-CMD-check](https://github.com/osorensen/BayesMallowsSMC2/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/osorensen/BayesMallowsSMC2/actions/workflows/R-CMD-check.yaml) -BayesMallowsSMC2 provides functions for performing sequential inference in the Bayesian Mallows model using the SMC$^{2}$ algorithm. +BayesMallowsSMC2 provides functions for performing sequential inference in the Bayesian Mallows model using the SMC$^{2}$ algorithm. The package implements the methods described in @sorensen2025sequential. + +The Bayesian Mallows model is a probabilistic framework for analyzing ranking data, and this package extends it to handle sequential learning scenarios where rankings arrive over time. The SMC$^{2}$ (Sequential Monte Carlo squared) algorithm enables efficient Bayesian inference by combining particle filtering with MCMC methods. ## Installation @@ -32,6 +35,17 @@ devtools::install_github("osorensen/BayesMallowsSMC2") ## Usage -This package is under development, and is not yet well documented. For examples on how to use it, see the code in the OSF repository https://osf.io/pquk4/. +This package implements sequential Bayesian inference for ranking data using the Mallows model. 
The main function is `compute_sequentially()`, which performs SMC$^{2}$ inference as rankings arrive over time. + +```r +library(BayesMallowsSMC2) + +# Example usage (see vignettes for detailed examples) +# result <- compute_sequentially(data, hyperparameters, smc_options) +``` + +For detailed examples and reproducible code, see the OSF repository at https://osf.io/pquk4/. + +## References diff --git a/README.md b/README.md index 935ceb4..6070770 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,15 @@ BayesMallowsSMC2 provides functions for performing sequential inference -in the Bayesian Mallows model using the SMC$^{2}$ algorithm. +in the Bayesian Mallows model using the SMC$^{2}$ algorithm. The package +implements the methods described in Sørensen, Frigessi, and Scheel +(2025). + +The Bayesian Mallows model is a probabilistic framework for analyzing +ranking data, and this package extends it to handle sequential learning +scenarios where rankings arrive over time. The SMC$^{2}$ (Sequential +Monte Carlo squared) algorithm enables efficient Bayesian inference by +combining particle filtering with MCMC methods. ## Installation @@ -23,6 +31,31 @@ devtools::install_github("osorensen/BayesMallowsSMC2") ## Usage -This package is under development, and is not yet well documented. For -examples on how to use it, see the code in the OSF repository +This package implements sequential Bayesian inference for ranking data +using the Mallows model. The main function is `compute_sequentially()`, +which performs SMC$^{2}$ inference as rankings arrive over time. + +``` r +library(BayesMallowsSMC2) + +# Example usage (see vignettes for detailed examples) +# result <- compute_sequentially(data, hyperparameters, smc_options) +``` + +For detailed examples and reproducible code, see the OSF repository at <https://osf.io/pquk4/>. + +## References + +
+ +
+ +Sørensen, Øystein, Arnoldo Frigessi, and Ida Scheel. 2025. “Sequential +Rank and Preference Learning with the Bayesian Mallows Model.” *Bayesian +Analysis*. <https://doi.org/10.1214/25-BA1564>. + +
+ +
diff --git a/inst/REFERENCES.bib b/inst/REFERENCES.bib new file mode 100644 index 0000000..46f9118 --- /dev/null +++ b/inst/REFERENCES.bib @@ -0,0 +1,9 @@ +@article{sorensen2025sequential, + title={Sequential Rank and Preference Learning with the Bayesian Mallows Model}, + author={S{\o}rensen, {\O}ystein and Frigessi, Arnoldo and Scheel, Ida}, + journal={Bayesian Analysis}, + year={2025}, + publisher={International Society for Bayesian Analysis}, + doi={10.1214/25-BA1564}, + url={https://projecteuclid.org/journals/bayesian-analysis/advance-publication/Sequential-Rank-and-Preference-Learning-with-the-Bayesian-Mallows-Model/10.1214/25-BA1564.full} +} diff --git a/man/compute_sequentially.Rd b/man/compute_sequentially.Rd index 610a73a..645ad50 100644 --- a/man/compute_sequentially.Rd +++ b/man/compute_sequentially.Rd @@ -12,39 +12,75 @@ compute_sequentially( ) } \arguments{ -\item{data}{A dataframe containing partial rankings or pairwise preferences. -If \code{data} contains complete or partial rankings, it must have the following -columns: +\item{data}{A data frame containing ranking or preference data with temporal +structure. The data frame must include \code{timepoint} and \code{user} columns. +For complete or partial rankings, additional columns should be: \itemize{ -\item \code{timepoint}: a numeric vector denoting the timepoint, starting at 1. -\item \code{user}: a vector identifying the user. -\item \code{item1}: ranking of item 1. -\item \code{item2}: ranking of item 2. -\item etc. +\item \code{timepoint}: Numeric vector denoting the timepoint, starting at 1. +\item \code{user}: Vector identifying the user providing the ranking. +\item \code{item1}, \code{item2}, etc.: Rankings of items (use NA for missing items +in partial rankings). } -If data contains pairwise preferences, it must have the following -structure: - +For pairwise preferences, the structure should be: \itemize{ -\item \code{timepoint}: a numeric vector denoting the timepoint, starting at 1. 
-\item \code{user}: a vector identifying the user. -\item \code{top_item}: identifier for the preferred item. -\item \code{bottom_item}: identifier for the dispreferred item. +\item \code{timepoint}: Numeric vector denoting the timepoint, starting at 1. +\item \code{user}: Vector identifying the user providing the preference. +\item \code{top_item}: Identifier for the preferred item. +\item \code{bottom_item}: Identifier for the less preferred item. }} -\item{hyperparameters}{A list returned from \code{\link[=set_hyperparameters]{set_hyperparameters()}}.} +\item{hyperparameters}{A list of hyperparameters returned from +\code{\link{set_hyperparameters}}. Defines the prior distributions for +model parameters.} -\item{smc_options}{A list returned from \code{\link[=set_smc_options]{set_smc_options()}}} +\item{smc_options}{A list of SMC algorithm options returned from +\code{\link{set_smc_options}}. Controls the behavior of the particle +filtering algorithm.} \item{topological_sorts}{A list returned from -\code{\link[=precompute_topological_sorts]{precompute_topological_sorts()}}. Only used with preference data, and -defaults to \code{NULL}.} +\code{\link{precompute_topological_sorts}}. Required when using pairwise +preference data, otherwise should be \code{NULL} (default).} } \value{ -An object of class BayesMallowsSMC2. +An object of class \code{BayesMallowsSMC2} containing the results +of the sequential inference, including parameter traces, log marginal +likelihood estimates, and other algorithm diagnostics. } \description{ -Compute the Bayesian Mallows model sequentially +This function performs sequential Bayesian inference for the Mallows model +using the SMC² (Sequential Monte Carlo squared) algorithm. It can handle +both complete/partial rankings and pairwise preference data that arrive +sequentially over time. 
+} +\examples{ +# Example with complete rankings +set.seed(123) +n_items <- 4 + +# Create synthetic ranking data +ranking_data <- data.frame( + timepoint = c(1, 1, 2, 2), + user = c(1, 2, 3, 4), + item1 = c(1, 2, 1, 3), + item2 = c(2, 1, 3, 1), + item3 = c(3, 4, 2, 4), + item4 = c(4, 3, 4, 2) +) + +# Set up hyperparameters and options +hyper <- set_hyperparameters(n_items = n_items) +opts <- set_smc_options(n_particles = 100, verbose = FALSE) + +# Run sequential inference +result <- compute_sequentially( + data = ranking_data, + hyperparameters = hyper, + smc_options = opts +) + +} +\references{ +\insertRef{sorensen2025sequential}{BayesMallowsSMC2} } diff --git a/man/set_hyperparameters.Rd b/man/set_hyperparameters.Rd index 79c85bc..6193507 100644 --- a/man/set_hyperparameters.Rd +++ b/man/set_hyperparameters.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/set_hyperparameters.R \name{set_hyperparameters} \alias{set_hyperparameters} -\title{Set hyperparameters} +\title{Set hyperparameters for Bayesian Mallows model} \usage{ set_hyperparameters( n_items, @@ -13,19 +13,46 @@ set_hyperparameters( ) } \arguments{ -\item{n_items}{Integer defining the number of items.} +\item{n_items}{Integer defining the number of items to be ranked. Must be a +positive integer.} -\item{alpha_shape}{Shape of gamma prior for alpha.} +\item{alpha_shape}{Positive numeric value specifying the shape parameter of +the gamma prior distribution for the scale parameter alpha. Default is 1.} -\item{alpha_rate}{Rate of gamma prior for alpha.} +\item{alpha_rate}{Positive numeric value specifying the rate parameter of +the gamma prior distribution for the scale parameter alpha. Default is 0.5.} -\item{cluster_concentration}{Concentration parameter of Dirichlet distribution for cluster probabilities.} +\item{cluster_concentration}{Positive numeric value specifying the +concentration parameter of the Dirichlet distribution for cluster +probabilities. 
Default is 10.} -\item{n_clusters}{Integer defining the number of clusters.} +\item{n_clusters}{Positive integer defining the number of clusters in the +mixture model. Default is 1.} } \value{ -A list +A list containing the hyperparameter values with elements: +\item{n_items}{Number of items} +\item{alpha_shape}{Shape parameter for alpha prior} +\item{alpha_rate}{Rate parameter for alpha prior} +\item{cluster_concentration}{Concentration parameter for cluster probabilities} +\item{n_clusters}{Number of clusters} } \description{ -Set hyperparameters +This function creates a list of hyperparameters for the Bayesian Mallows model +used in sequential Monte Carlo inference. The hyperparameters define the prior +distributions for the model parameters. +} +\examples{ +# Basic hyperparameters for 5 items +hyper <- set_hyperparameters(n_items = 5) + +# Custom hyperparameters with multiple clusters +hyper <- set_hyperparameters( + n_items = 10, + alpha_shape = 2, + alpha_rate = 1, + cluster_concentration = 5, + n_clusters = 3 +) + } diff --git a/man/set_smc_options.Rd b/man/set_smc_options.Rd index 9eeb81f..0ccb91e 100644 --- a/man/set_smc_options.Rd +++ b/man/set_smc_options.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/set_smc_options.R \name{set_smc_options} \alias{set_smc_options} -\title{Set SMC options} +\title{Set SMC options for sequential inference} \usage{ set_smc_options( n_particles = 1000, @@ -20,40 +20,74 @@ set_smc_options( ) } \arguments{ -\item{n_particles}{Number of particles} +\item{n_particles}{Positive integer specifying the number of particles to use +in the SMC algorithm. Default is 1000.} -\item{n_particle_filters}{Initial number of particle filters for each -particle} +\item{n_particle_filters}{Positive integer specifying the initial number of +particle filters for each particle. 
Default is 50.} -\item{max_particle_filters}{Maximum number of particle filters.} +\item{max_particle_filters}{Positive integer specifying the maximum number +of particle filters allowed. Default is 10000.} -\item{resampling_threshold}{Effective sample size threshold for resampling} +\item{resampling_threshold}{Positive numeric value specifying the effective +sample size threshold for triggering resampling. Default is n_particles/2.} -\item{doubling_threshold}{Threshold for particle filter doubling. If the -acceptance rate of the rejuvenation step falls below this threshold, the -number of particle filters is doubled. Defaults to 0.2.} +\item{doubling_threshold}{Numeric value between 0 and 1 specifying the +threshold for particle filter doubling. If the acceptance rate of the +rejuvenation step falls below this threshold, the number of particle +filters is doubled. Default is 0.2.} -\item{max_rejuvenation_steps}{Maximum number of rejuvenation steps. If the -number of unique particles has not exceeded half the number of particles -after this many steps, the rejuvenation is still stopped.} +\item{max_rejuvenation_steps}{Positive integer specifying the maximum number +of rejuvenation steps. If the number of unique particles has not exceeded +half the number of particles after this many steps, the rejuvenation is +stopped. Default is 20.} -\item{metric}{Metric} +\item{metric}{Character string specifying the distance metric to use. +Options include "footrule", "kendall", "spearman", "cayley", "hamming", +and "ulam". Default is "footrule".} -\item{resampler}{resampler} +\item{resampler}{Character string specifying the resampling method. +Options include "multinomial", "residual", "stratified", and "systematic". +Default is "multinomial".} -\item{latent_rank_proposal}{latent rank proposal} +\item{latent_rank_proposal}{Character string specifying the proposal +distribution for latent rankings. 
Default is "uniform".} -\item{verbose}{Boolean} +\item{verbose}{Logical value indicating whether to print progress messages +during computation. Default is FALSE.} -\item{trace}{Logical specifying whether to save static parameters at each -timestep.} +\item{trace}{Logical value specifying whether to save static parameters at +each timestep. Default is FALSE.} -\item{trace_latent}{Logical specifying whether to sample and save one -complete set of latent rankings for each particle and each timepoint.} +\item{trace_latent}{Logical value specifying whether to sample and save one +complete set of latent rankings for each particle and each timepoint. +Default is FALSE.} } \value{ -A list +A list containing all the SMC options with the specified values. } \description{ -Set SMC options +This function creates a list of options for the Sequential Monte Carlo (SMC²) +algorithm used in Bayesian inference for the Mallows model. These options +control the behavior of the particle filtering and resampling procedures. +} +\examples{ +# Default SMC options +opts <- set_smc_options() + +# Custom SMC options with fewer particles and Kendall distance +opts <- set_smc_options( + n_particles = 500, + n_particle_filters = 25, + metric = "kendall", + verbose = TRUE +) + +# Options for tracing parameters +opts <- set_smc_options( + n_particles = 100, + trace = TRUE, + trace_latent = TRUE +) + }