README

The goal of thamesblock is to compute the logarithm of the reciprocal marginal likelihood for block models. For example, it can be used on the parametric or nonparametric stochastic block model (SBM).

The reciprocal marginal likelihood is an essential component of Bayesian model choice or model averaging, and one of the core components of Bayes’ rule. In the context of the SBM, for example, it can be used to determine the optimal number of clusters. If the prior on the number of clusters is uniform, this optimal number of components is the number tha maximises the marginal likelihood (i.e., minimises the logarithm of the reciprocal marginal likelihood).

Installation

# install.packages("pak")
pak::pak("m-metodiev/thamesblock")

Example

This is a basic example of determining the optimal number of clusters in a SBM. It works on the following dummy-dataset, which is an adjacency matrix of a graph with 10 nodes and 1 edge.

library(thamesblock)
size_dataset = 10
dataset = matrix(0, nrow=size_dataset, ncol=size_dataset)
dataset[1,2] = 1
dataset[2,1] = 1
print("dataset")
#> [1] "dataset"
dataset
#>       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
#>  [1,]    0    1    0    0    0    0    0    0    0     0
#>  [2,]    1    0    0    0    0    0    0    0    0     0
#>  [3,]    0    0    0    0    0    0    0    0    0     0
#>  [4,]    0    0    0    0    0    0    0    0    0     0
#>  [5,]    0    0    0    0    0    0    0    0    0     0
#>  [6,]    0    0    0    0    0    0    0    0    0     0
#>  [7,]    0    0    0    0    0    0    0    0    0     0
#>  [8,]    0    0    0    0    0    0    0    0    0     0
#>  [9,]    0    0    0    0    0    0    0    0    0     0
#> [10,]    0    0    0    0    0    0    0    0    0     0

To calculate the THAMES for the SBM, we need an MCMC sample. Such a sample needs to be computed via a MCMC sampler, e.g., a Gibbs sampler, in practice. For this example, the following sample is provided. Each row corresponds to one draw from the MCMC distribution. Each column corresponds to a datapoint. The entries corresponds to cluster allocations, which each indicate the cluster of a data point. In this example, we want to compute the reciprocal marginal likelihood of the SBM with 2 clusters, so the values of the sample are between 0 (cluster 1) and 2 (cluster 2).

num_clusters = 2 # number of clusters
iterations = 6 # size of the MCMC sample
clustermat = rbind(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0),
c(1, 0, 0, 0, 1, 0, 0, 1, 0, 0))
clustermat
#>      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
#> [1,]    0    0    0    0    0    0    0    0    0     0
#> [2,]    0    0    0    0    0    0    0    0    0     0
#> [3,]    0    0    0    0    0    0    0    0    0     0
#> [4,]    0    0    0    0    0    0    0    0    0     0
#> [5,]    0    0    0    0    1    0    0    0    0     0
#> [6,]    1    0    0    0    1    0    0    1    0     0

A function that computes the un-normalised log-posterior values of the MCMC sample (the sum of the log-prior plus the loglikelihood) is given below. It should be noted that the functions implemented in this example are quite slow, and that they should be implemented in Rcpp for optimal performance.



# should be implemented in Rcpp for optimal performance

logprior = function(clustermat){
dirichlet_hyperparameter_vector = rep(1, num_clusters)
logprior_values = numeric(iterations)

for(iter in (1:iterations)){
update_dirichlet_hyperparameter_vector = numeric(num_clusters)
for(cluster_index in 0:(num_clusters-1)){
 update_dirichlet_hyperparameter_vector[cluster_index + 1] =
   dirichlet_hyperparameter_vector[cluster_index + 1] +
   sum(clustermat[iter,] == cluster_index)
 }
 logprior_values[iter] = lgamma(sum(dirichlet_hyperparameter_vector)) +
 sum(lgamma(dirichlet_hyperparameter_vector)) -
 lgamma(sum(update_dirichlet_hyperparameter_vector)) -
 sum(lgamma(update_dirichlet_hyperparameter_vector))
 }
   return(logprior_values)
   }

loglik = function(clustermat){
alpha_hyperparameters = matrix(1, nrow=num_clusters, ncol=num_clusters)
beta_hyperparameters = matrix(1, nrow=num_clusters, ncol=num_clusters)
loglik_values = numeric(iterations)
for(iter in iterations){
updated_alpha_hyperparameters = matrix(nrow=num_clusters, ncol=num_clusters)
updated_beta_hyperparameters = matrix(nrow=num_clusters, ncol=num_clusters)
for(cluster_index_1 in 0:(num_clusters-1)){
 for(cluster_index_2 in 0:(num_clusters-1)){
   updated_alpha_hyperparameters[cluster_index_1 + 1,
                                 cluster_index_2 + 1] =
   alpha_hyperparameters[cluster_index_1 + 1,
                           cluster_index_2 + 1] +
   sum(dataset[which(clustermat[iter,] == cluster_index_1),
               which(clustermat[iter,] == cluster_index_2)])
   updated_beta_hyperparameters[cluster_index_1 + 1,
                                 cluster_index_2 + 1] =
       beta_hyperparameters[cluster_index_1 + 1,
                             cluster_index_2 + 1] +
         sum(1 - dataset[which(clustermat[iter,] == cluster_index_1),
         which(clustermat[iter,] == cluster_index_2)])
   loglik_values[iter] = loglik_values[iter] +
        lgamma(alpha_hyperparameters[cluster_index_1 + 1,
                                     cluster_index_2 + 1] +
                  beta_hyperparameters[cluster_index_1 + 1,
                                       cluster_index_2 + 1]) +
       lgamma(updated_alpha_hyperparameters[cluster_index_1 + 1,
                                           cluster_index_2 + 1]) +
       lgamma(updated_beta_hyperparameters[cluster_index_1 + 1,
                                           cluster_index_2 + 1]) -
       lgamma(updated_alpha_hyperparameters[cluster_index_1 + 1,
                                             cluster_index_2 + 1] +
             updated_beta_hyperparameters[cluster_index_1 + 1,
                                         cluster_index_2 + 1]) -
       lgamma(alpha_hyperparameters[cluster_index_1 + 1,
                                   cluster_index_2 + 1]) -
       lgamma(beta_hyperparameters[cluster_index_1 + 1,
                                   cluster_index_2 + 1])
     }
   }
 }
return(loglik_values)
}
logpost = function(clustermat) logprior(clustermat) + loglik(clustermat)

The THAMES can now be used. As a default, the logarithm of the reciprocal marginal likelihood is given. It can be transformed to the logarithm of the marginal likelihood by multiplication with minus one.