js.dist | R Documentation |
The Jensen-Shannon distance is a method to measure the distance between discrete probability distributions. To measure the distance between 2 high-dimensional datasets, we cut the space into sub-cubes, then count the number of events per cube. The resulting probability distributions can be compared using the Jensen-Shannon distance.
js.dist(mat, pc = 1e-04)
mat |
a matrix of counts, where rows correspond to samples and columns to Hilbert index |
pc |
a pseudo-count that is added to all samples to avoid divide-by-zero errors |
a S3 distance object
Yann Abraham
# generate 3 samples over 5 dimensions # sample 1 and 2 are similar, sample 3 has an extra population # set the seed for reproducible examples set.seed(1234) my.samples <- lapply(LETTERS[1:3],function(j) { # each sample has a different number of events n <- floor(runif(1,0.5,0.8)*10000) # matrix is random normal over 5 dimensions cur.mat <- matrix(rnorm(5*n),ncol=5) # rescale cur.mat to a [0,3] interval cur.mat <- 3*(cur.mat-min(cur.mat))/diff(range(cur.mat)) dimnames(cur.mat)[[2]] <- LETTERS[(length(LETTERS)-4):length(LETTERS)] if(j=='C') { # select 30% of the points cur.rws <- sample(n,round(n*0.3,0)) # select 2 columns at random cur.cls <- sample(ncol(cur.mat),2) # create an artificial sub population cur.mat[cur.rws,cur.cls] <- 4*cur.mat[cur.rws,cur.cls] } return(cur.mat) } ) names(my.samples) <- LETTERS[1:3] # check the population size lapply(my.samples,nrow) # assemble a sample matrix my.samples.mat <- do.call('rbind',my.samples) my.samples.id <- lapply(names(my.samples), function(cur.spl) rep(cur.spl,nrow(my.samples[[cur.spl]]))) my.samples.id <- unlist(my.samples.id) # Estimate the maximum required Hilbert order hilbert.order(my.samples.mat) # Estimate the cut positions my.cuts <- make.cut(my.samples.mat,n=5,count.lim=5) # Visualize the cuts show.cut(my.cuts) # Cut the matrix & compute the hilbert index my.samples.cut <- do.cut(my.samples.mat,my.cuts,type='combined') system.time(my.samples.index <- do.hilbert(my.samples.cut,horder=4)) # Visualize samples as density plots my.samples.dens <- density(my.samples.index) my.samples.dens$y <- (my.samples.dens$y-min(my.samples.dens$y))/diff(range(my.samples.dens$y)) plot(my.samples.dens,col='grey3',lty=2) ksink <- lapply(names(my.samples),function(cur.spl) { cat(cur.spl,'\n') cur.dens <- density(my.samples.index[my.samples.id==cur.spl], bw=my.samples.dens$bw) cur.dens$y <- (cur.dens$y-min(cur.dens$y))/diff(range(cur.dens$y)) lines(cur.dens$x, cur.dens$y, col=match(cur.spl,names(my.samples))+1) } ) legend('topright', legend=names(my.samples), co=seq(length(my.samples))+1, pch=16, bty='n' ) # assemble a contingency table my.samples.table <- table(my.samples.index,my.samples.id) dim(my.samples.table) heatmap(log10(my.samples.table+0.00001), col=colorRampPalette(c('white',blues9))(24), Rowv=NA,Colv=NA, scale='none') # compute the Jensen-Shannon distance my.samples.dist <- js.dist(t(my.samples.table)) my.samples.clust <- hclust(my.samples.dist) plot(my.samples.clust)