diff --git a/R/compute_sequentially.R b/R/compute_sequentially.R index 4537bc4..cd61a9f 100644 --- a/R/compute_sequentially.R +++ b/R/compute_sequentially.R @@ -1,5 +1,33 @@ #' Compute the Bayesian Mallows model sequentially #' +#' This function implements the nested sequential Monte Carlo (SMC2) algorithm +#' for sequential learning of rank and preference data using the Bayesian +#' Mallows model. The algorithm processes data sequentially over time, +#' maintaining a particle approximation to the posterior distribution. +#' +#' @details +#' The nested SMC2 algorithm consists of two levels of sequential Monte Carlo: +#' +#' **Outer SMC (Parameter Level)**: Maintains particles representing samples +#' from the posterior distribution of static parameters (alpha, rho, tau). +#' Each particle contains its own set of parameter values. +#' +#' **Inner SMC (Latent State Level)**: For each parameter particle, maintains +#' multiple particle filters that track the evolution of latent rankings and +#' cluster assignments over time. This nested structure allows the algorithm +#' to handle the complex dependency structure between parameters and latent +#' states. +#' +#' At each timepoint, the algorithm: +#' 1. Propagates each parameter particle forward using MCMC moves +#' 2. For each parameter particle, runs multiple particle filters to sample +#' latent rankings and cluster assignments +#' 3. Computes importance weights based on the likelihood of observed data +#' 4. Resamples particles when effective sample size drops below threshold +#' 5. Applies rejuvenation moves to maintain particle diversity +#' +#' The nested structure is essential for maintaining proper uncertainty +#' quantification in the joint parameter-latent state space. #' #' @param data A dataframe containing partial rankings or pairwise preferences. 
#' If `data` contains complete or partial rankings, it must have the following @@ -24,12 +52,93 @@ #' } #' #' @param hyperparameters A list returned from [set_hyperparameters()]. -#' @param smc_options A list returned from [set_smc_options()] +#' @param smc_options A list returned from [set_smc_options()]. Controls the +#' nested SMC2 algorithm parameters including number of parameter particles, +#' number of particle filters per parameter particle, and MCMC move parameters. #' @param topological_sorts A list returned from #' [precompute_topological_sorts()]. Only used with preference data, and -#' defaults to `NULL`. +#' defaults to `NULL`. Contains precomputed topological sorts for efficient +#' sampling from constrained ranking spaces. +#' +#' @return An object of class BayesMallowsSMC2 containing posterior samples +#' and algorithm diagnostics. +#' +#' @references +#' Sørensen, Ø., Stein, A., Netto, W. L., & Leslie, D. S. (2025). +#' Sequential Rank and Preference Learning with the Bayesian Mallows Model. +#' \emph{Bayesian Analysis}. DOI: 10.1214/25-BA1564. 
+#' +#' @seealso [set_hyperparameters()], [set_smc_options()], [precompute_topological_sorts()] +#' +#' @examples +#' # Example with complete rankings +#' library(BayesMallowsSMC2) +#' +#' # Generate synthetic ranking data +#' set.seed(123) +#' n_items <- 5 +#' n_users <- 20 +#' n_timepoints <- 10 +#' +#' # Create synthetic data +#' data <- expand.grid( +#' timepoint = 1:n_timepoints, +#' user = 1:n_users +#' ) +#' +#' # Add a random complete ranking (a permutation of 1:n_items) to each row +#' rankings <- t(replicate(nrow(data), sample(n_items))) +#' colnames(rankings) <- paste0("item", seq_len(n_items)) +#' data <- cbind(data, rankings) +#' +#' # Set hyperparameters +#' hyperparams <- set_hyperparameters( +#' n_items = n_items, +#' alpha_shape = 2, +#' alpha_rate = 1, +#' n_clusters = 2 +#' ) +#' +#' # Set SMC options +#' smc_opts <- set_smc_options( +#' n_particles = 100, +#' n_particle_filters = 20, +#' metric = "kendall", +#' verbose = TRUE +#' ) +#' +#' # Run sequential computation +#' result <- compute_sequentially( +#' data = data, +#' hyperparameters = hyperparams, +#' smc_options = smc_opts +#' ) +#' +#' # Example with pairwise preferences +#' # First precompute topological sorts +#' prefs_matrix <- matrix(c(1, 2, 2, 3, 1, 3), ncol = 2, byrow = TRUE) +#' topo_sorts <- precompute_topological_sorts( +#' prefs = prefs_matrix, +#' n_items = 3, +#' save_frac = 0.1 +#' ) +#' +#' # Create preference data +#' pref_data <- data.frame( +#' timepoint = c(1, 1, 2, 2), +#' user = c(1, 2, 1, 2), +#' top_item = c(1, 2, 3, 1), +#' bottom_item = c(2, 3, 1, 3) +#' ) +#' +#' # Run with preferences +#' result_prefs <- compute_sequentially( +#' data = pref_data, +#' hyperparameters = set_hyperparameters(n_items = 3), +#' smc_options = set_smc_options(n_particles = 50), +#' topological_sorts = topo_sorts +#' ) #' -#' @return An object of class BayesMallowsSMC2. 
#' @export #' compute_sequentially <- function( diff --git a/R/set_hyperparameters.R b/R/set_hyperparameters.R index 9425524..f4d207e 100644 --- a/R/set_hyperparameters.R +++ b/R/set_hyperparameters.R @@ -1,12 +1,83 @@ -#' Set hyperparameters +#' Set hyperparameters for Bayesian Mallows model #' -#' @param n_items Integer defining the number of items. -#' @param alpha_shape Shape of gamma prior for alpha. -#' @param alpha_rate Rate of gamma prior for alpha. -#' @param cluster_concentration Concentration parameter of Dirichlet distribution for cluster probabilities. -#' @param n_clusters Integer defining the number of clusters. +#' Configure prior distributions and model structure for the Bayesian Mallows +#' model used in sequential estimation. This function sets hyperparameters +#' for the precision parameters, modal rankings, and cluster structure. +#' +#' @details +#' The Bayesian Mallows model assumes: +#' +#' **Precision Parameters**: Each cluster k has a precision parameter alpha_k +#' that controls the concentration around the modal ranking. Higher values +#' indicate stronger agreement. The prior is Gamma(alpha_shape, alpha_rate). +#' +#' **Modal Rankings**: Each cluster has a modal ranking rho_k that represents +#' the "consensus" ranking for that cluster. These are sampled uniformly +#' from the space of permutations. +#' +#' **Cluster Probabilities**: The probability of assignment to each cluster +#' follows a Dirichlet distribution with concentration parameter +#' cluster_concentration. +#' +#' **Cluster Structure**: The number of clusters must be specified a priori. +#' Model selection can be performed by comparing models with different +#' numbers of clusters. +#' +#' @param n_items Integer. Number of items being ranked. Must be provided +#' and determines the dimensionality of the ranking space. +#' @param alpha_shape Numeric. Shape parameter of the Gamma prior distribution +#' for precision parameters alpha_k. 
Higher values concentrate the prior +#' around higher precision values. Default is 1 (exponential prior). +#' @param alpha_rate Numeric. Rate parameter of the Gamma prior distribution +#' for precision parameters alpha_k. Higher values favor lower precision +#' (more dispersed rankings). Default is 0.5. +#' @param cluster_concentration Numeric. Concentration parameter of the +#' Dirichlet prior for cluster assignment probabilities. Higher values +#' favor more equal cluster sizes, while lower values allow more unbalanced +#' clusters. Default is 10. +#' @param n_clusters Integer. Number of mixture components (clusters) in the +#' model. Each cluster has its own modal ranking and precision parameter. +#' Default is 1 (single-cluster model). +#' +#' @return A list containing all hyperparameter values for use in +#' [compute_sequentially()]. +#' +#' @references +#' Sørensen, Ø., Stein, A., Netto, W. L., & Leslie, D. S. (2025). +#' Sequential Rank and Preference Learning with the Bayesian Mallows Model. +#' \emph{Bayesian Analysis}. DOI: 10.1214/25-BA1564. 
+#' +#' @seealso [compute_sequentially()], [set_smc_options()] +#' +#' @examples +#' # Basic hyperparameters for 5 items, single cluster +#' basic_hyper <- set_hyperparameters(n_items = 5) +#' +#' # Multiple clusters with informative priors +#' multi_cluster <- set_hyperparameters( +#' n_items = 10, +#' alpha_shape = 2, # More concentrated precision prior +#' alpha_rate = 1, # Moderate precision values +#' n_clusters = 3, # Three mixture components +#' cluster_concentration = 5 # Allow unbalanced clusters +#' ) +#' +#' # High-precision scenario (strong agreement expected) +#' high_precision <- set_hyperparameters( +#' n_items = 8, +#' alpha_shape = 5, # Strong prior for high precision +#' alpha_rate = 0.5, # Favor high alpha values +#' n_clusters = 2 +#' ) +#' +#' # Low-precision scenario (weak agreement expected) +#' low_precision <- set_hyperparameters( +#' n_items = 6, +#' alpha_shape = 1, # Weak prior +#' alpha_rate = 2, # Favor low alpha values +#' n_clusters = 1 +#' ) #' -#' @return A list #' @export #' set_hyperparameters <- function( diff --git a/R/set_smc_options.R b/R/set_smc_options.R index 425e7d8..3eea864 100644 --- a/R/set_smc_options.R +++ b/R/set_smc_options.R @@ -1,26 +1,102 @@ -#' Set SMC options -#' -#' @param n_particles Number of particles -#' @param n_particle_filters Initial number of particle filters for each -#' particle -#' @param max_particle_filters Maximum number of particle filters. -#' @param resampling_threshold Effective sample size threshold for resampling -#' @param doubling_threshold Threshold for particle filter doubling. If the -#' acceptance rate of the rejuvenation step falls below this threshold, the -#' number of particle filters is doubled. Defaults to 0.2. -#' @param max_rejuvenation_steps Maximum number of rejuvenation steps. If the -#' number of unique particles has not exceeded half the number of particles -#' after this many steps, the rejuvenation is still stopped. 
-#' @param metric Metric -#' @param resampler resampler -#' @param latent_rank_proposal latent rank proposal -#' @param verbose Boolean -#' @param trace Logical specifying whether to save static parameters at each -#' timestep. -#' @param trace_latent Logical specifying whether to sample and save one -#' complete set of latent rankings for each particle and each timepoint. -#' -#' @return A list +#' Set SMC options for nested sequential Monte Carlo algorithm +#' +#' Configure parameters for the nested SMC2 algorithm used in sequential +#' Bayesian Mallows model estimation. This function sets both outer-level +#' (parameter particle) and inner-level (latent state particle filter) +#' algorithm parameters. +#' +#' @details +#' The nested SMC2 algorithm requires careful tuning of both levels: +#' +#' **Outer SMC Level**: Controls the number of parameter particles and their +#' rejuvenation through MCMC moves. More particles provide better approximation +#' but increase computational cost. +#' +#' **Inner SMC Level**: Each parameter particle maintains multiple particle +#' filters for latent rankings. More filters improve latent state estimation +#' but multiply computational cost by the number of parameter particles. +#' +#' @param n_particles Integer. Number of parameter particles in the outer SMC +#' sampler. Each particle represents a sample from the posterior distribution +#' of static parameters (alpha, rho, tau). Larger values improve posterior +#' approximation accuracy but increase computational cost linearly. +#' @param n_particle_filters Integer. Initial number of particle filters +#' maintained by each parameter particle for sampling latent rankings and +#' cluster assignments. This creates the nested structure where each of the +#' `n_particles` parameter particles runs `n_particle_filters` inner particle +#' filters. +#' @param max_particle_filters Integer. Maximum number of particle filters +#' allowed per parameter particle. 
The algorithm may double the number of +#' filters when rejuvenation acceptance rates are low, up to this limit. +#' @param resampling_threshold Numeric. Effective sample size threshold for +#' triggering resampling of parameter particles. When ESS falls below this +#' value, resampling uses the `resampler` method. Default is `n_particles / 2`. +#' @param doubling_threshold Numeric. Acceptance rate threshold for particle +#' filter doubling during rejuvenation. If the acceptance rate of MCMC +#' rejuvenation moves falls below this threshold, the number of particle +#' filters is doubled to improve mixing. Defaults to 0.2. +#' @param max_rejuvenation_steps Integer. Maximum number of MCMC rejuvenation +#' steps applied to parameter particles. Rejuvenation stops early if the +#' number of unique particles exceeds half the total number of particles, +#' indicating sufficient diversity. +#' @param metric Character. Distance metric for the Mallows model. Options +#' include "kendall", "cayley", "hamming", "footrule", "spearman", and "ulam". +#' Different metrics capture different aspects of ranking disagreement. +#' @param resampler Character. Resampling algorithm for parameter particles. +#' Options include "multinomial", "residual", "stratified", and "systematic". +#' Systematic resampling often provides better performance. +#' @param latent_rank_proposal Character. Proposal mechanism for sampling +#' latent rankings in the inner particle filters. Options include "uniform" +#' and other problem-specific proposals. +#' @param verbose Logical. Whether to print algorithm progress and diagnostics +#' during execution. Useful for monitoring convergence and performance. +#' @param trace Logical. Whether to save static parameter values (alpha, rho, +#' tau) from all particles at each timestep. Enables detailed posterior +#' analysis but increases memory usage. +#' @param trace_latent Logical. 
Whether to sample and save one complete set +#' of latent rankings for each parameter particle at each timepoint. Provides +#' full posterior samples of latent states but significantly increases memory +#' requirements. +#' +#' @return A list containing all SMC2 algorithm parameters for use in +#' [compute_sequentially()]. +#' +#' @references +#' Sørensen, Ø., Stein, A., Netto, W. L., & Leslie, D. S. (2025). +#' Sequential Rank and Preference Learning with the Bayesian Mallows Model. +#' \emph{Bayesian Analysis}. DOI: 10.1214/25-BA1564. +#' +#' @seealso [compute_sequentially()], [set_hyperparameters()] +#' +#' @examples +#' # Basic SMC options for small problems +#' basic_opts <- set_smc_options( +#' n_particles = 100, +#' n_particle_filters = 20, +#' metric = "kendall" +#' ) +#' +#' # High-precision options for larger problems +#' precise_opts <- set_smc_options( +#' n_particles = 1000, +#' n_particle_filters = 100, +#' max_particle_filters = 500, +#' resampling_threshold = 500, +#' metric = "footrule", +#' resampler = "systematic", +#' verbose = TRUE, +#' trace = TRUE +#' ) +#' +#' # Memory-efficient options +#' efficient_opts <- set_smc_options( +#' n_particles = 200, +#' n_particle_filters = 30, +#' trace = FALSE, +#' trace_latent = FALSE, +#' verbose = FALSE +#' ) +#' #' @export #' set_smc_options <- function( diff --git a/README.Rmd b/README.Rmd index b02b155..c9035e1 100644 --- a/README.Rmd +++ b/README.Rmd @@ -19,7 +19,16 @@ knitr::opts_chunk$set( [![R-CMD-check](https://github.com/osorensen/BayesMallowsSMC2/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/osorensen/BayesMallowsSMC2/actions/workflows/R-CMD-check.yaml) -BayesMallowsSMC2 provides functions for performing sequential inference in the Bayesian Mallows model using the SMC$^{2}$ algorithm. +BayesMallowsSMC2 provides functions for performing sequential inference in the Bayesian Mallows model using the nested sequential Monte Carlo (SMC²) algorithm. 
This package implements the methodology described in Sørensen et al. (2025) for learning from ranking and preference data that arrives sequentially over time. + +## Key Features + +- **Sequential Learning**: Process ranking and preference data as it arrives over time +- **Nested SMC² Algorithm**: Efficient particle-based inference for complex parameter-latent state dependencies +- **Multiple Data Types**: Support for complete rankings, partial rankings, and pairwise preferences +- **Flexible Distance Metrics**: Kendall, Cayley, Hamming, Footrule, Spearman, and Ulam distances +- **Mixture Models**: Multi-cluster Bayesian Mallows models for heterogeneous populations +- **Real-time Inference**: Online posterior updates without reprocessing historical data ## Installation @@ -30,8 +39,146 @@ You can install the development version of BayesMallowsSMC2 from [GitHub](https: devtools::install_github("osorensen/BayesMallowsSMC2") ``` -## Usage +## Quick Start + +Here's a basic example of sequential ranking analysis: + +```{r example, eval=FALSE} +library(BayesMallowsSMC2) + +# Generate synthetic ranking data +set.seed(123) +n_items <- 5 +n_users <- 20 +n_timepoints <- 10 + +# Create sequential ranking data +data <- expand.grid( + timepoint = 1:n_timepoints, + user = 1:n_users +) + +# Add a random complete ranking to each row (1 = most preferred, 5 = least preferred) +rankings <- t(replicate(nrow(data), sample(n_items))) +colnames(rankings) <- paste0("item", seq_len(n_items)) +data <- cbind(data, rankings) + +# Set up model parameters +hyperparams <- set_hyperparameters( + n_items = n_items, + n_clusters = 2, # Two preference groups + alpha_shape = 2, # Precision prior + alpha_rate = 1 +) + +# Configure SMC² algorithm +smc_opts <- set_smc_options( + n_particles = 500, # Parameter particles + n_particle_filters = 100, # Latent state filters per particle + metric = "kendall", # Distance metric + verbose = TRUE +) + +# Run sequential inference +result <- compute_sequentially( + data = data, + hyperparameters = hyperparams, + 
smc_options = smc_opts +) + +# Analyze results +summary(result) +plot(result) +``` + +## Algorithm Overview + +The nested SMC² algorithm operates on two levels: + +1. **Outer SMC (Parameter Level)**: Maintains particles representing samples from the posterior distribution of static parameters (precision α, modal rankings ρ, cluster probabilities τ) + +2. **Inner SMC (Latent State Level)**: For each parameter particle, runs multiple particle filters to track the evolution of latent rankings and cluster assignments over time + +This nested structure enables efficient inference in the complex joint parameter-latent state space while maintaining proper uncertainty quantification. + +## Data Formats + +### Complete Rankings +```{r data-complete, eval=FALSE} +data <- data.frame( + timepoint = c(1, 1, 2, 2), + user = c(1, 2, 1, 2), + item1 = c(1, 2, 1, 3), # Rankings for item 1 + item2 = c(2, 1, 2, 1), # Rankings for item 2 + item3 = c(3, 3, 3, 2) # Rankings for item 3 +) +``` + +### Pairwise Preferences +```{r data-preferences, eval=FALSE} +# First precompute topological sorts +prefs_matrix <- matrix(c(1, 2, 2, 3, 1, 3), ncol = 2, byrow = TRUE) +topo_sorts <- precompute_topological_sorts( + prefs = prefs_matrix, + n_items = 3, + save_frac = 0.1 +) + +# Create preference data +pref_data <- data.frame( + timepoint = c(1, 1, 2, 2), + user = c(1, 2, 1, 2), + top_item = c(1, 2, 3, 1), # Preferred item + bottom_item = c(2, 3, 1, 3) # Dispreferred item +) + +# Run analysis +result <- compute_sequentially( + data = pref_data, + hyperparameters = set_hyperparameters(n_items = 3), + smc_options = set_smc_options(n_particles = 200), + topological_sorts = topo_sorts +) +``` + +## Performance Tuning + +The algorithm's performance depends on several key parameters: + +- **n_particles**: More particles improve accuracy but increase computational cost +- **n_particle_filters**: More filters per particle improve latent state estimation +- **resampling_threshold**: Controls when to 
resample particles (default: n_particles/2) +- **metric**: Different distance metrics capture different aspects of ranking disagreement + +For large-scale problems, consider: +```{r tuning, eval=FALSE} +# High-performance configuration +smc_opts <- set_smc_options( + n_particles = 1000, + n_particle_filters = 200, + max_particle_filters = 1000, + resampler = "systematic", + trace = FALSE, # Reduce memory usage + trace_latent = FALSE +) +``` + +## Citation + +If you use this package in your research, please cite: + +> Sørensen, Ø., Stein, A., Netto, W. L., & Leslie, D. S. (2025). Sequential Rank and Preference Learning with the Bayesian Mallows Model. *Bayesian Analysis*. DOI: 10.1214/25-BA1564. + +## Additional Resources + +- **Paper**: The foundational methodology paper with theoretical details and empirical studies +- **OSF Repository**: https://osf.io/pquk4/ - Contains replication code and additional examples +- **Vignettes**: Detailed tutorials and case studies (coming soon) + +## Getting Help -This package is under development, and is not yet well documented. For examples on how to use it, see the code in the OSF repository https://osf.io/pquk4/. +- **Issues**: Report bugs and request features on [GitHub Issues](https://github.com/osorensen/BayesMallowsSMC2/issues) +- **Documentation**: Use `?function_name` for detailed help on specific functions +- **Examples**: See function documentation for comprehensive examples