Hierarchical Clustering of random sample of segments (with repetitions)

knitr::opts_chunk$set(warning=FALSE, message=FALSE, fig.height=6, fig.width=8,
                      fig.path = "figure/hclustrep_")

require(rgdal)
require(raster)
require(plyr)
require(scales)

dirvec <- "/media/alobo/LaCie2T1/FCConesa/Sahara2/SaharaOTBseg/sub1"
dirdata <- "/media/alobo/LaCie2T1/FCConesa/Sahara2/sahara_datatest"
#dirvec <- "/Volumes/LaCie2T1/FCConesa/Sahara2/SaharaOTBseg/sub1"
# dirdata <- "/Volumes/LaCie2T1/FCConesa/Sahara2/sahara_datatest"

1. Read Data

#Already done in hclust_log.R
# segMSPCA <- readOGR(dsn=dirvec, layer="WV3sub1PCAMSS_r100_s5_R5_S5_m0_m20",
#                     stringsAsFactors = FALSE)
# names(segMSPCA@data) 
# #we get rid of the left vertical 
# segMSPCA@data[segMSPCA@data$label==1,]
# plot(segMSPCA[segMSPCA@data$label==1,])
# #plot(segMSPCA[segMSPCA@data$label!=1,])
# segMSPCA <- segMSPCA[segMSPCA@data$label!=1,]

2. Random sample

set.seed(1000)
rind <- sample(1:nrow(segMSPCA@data),5000)
#to take segment sizes into account
medsize <- sum(segMSPCA@data$nbPixels)/nrow(segMSPCA@data)
#probaility of each segment is proportional to its size in pixels
prob <- segMSPCA@data$nbPixels/sum(segMSPCA@data$nbPixels)
sum(5000*prob)
## [1] 5000
sum(round(5000*prob))
## [1] 816
sum(round(10000*prob))
## [1] 2033
#nsamples in each segment according to its probability
#(which is proportional to its size)
nsamples <- round(10000*prob)

Repeat each column according to the value of nsamples

segMSPCArndr <- segMSPCA[rep(seq_len(nrow(segMSPCA)),nsamples),]
dim(segMSPCArndr)
#Add a rnd nb within the conf interval of the mean to avoid
#identical individuals
# for(i in 3:5) {
#   segMSPCArndr@data[,i] <- segMSPCArndr@data[,i] +
#     runif(nrow(segMSPCArndr@data),
#           -sqrt(segMSPCArndr@data[,i+3]/segMSPCArndr@data[,2]),
#           sqrt(segMSPCArndr@data[,i+3]/segMSPCArndr@data[,2]))
# }
#The above produces "individuals" that are too similar
#Add a rnd nb within the  0.5*SD of the mean to avoid identical individuals
for(i in 3:5) {
  segMSPCArndr@data[,i] <- segMSPCArndr@data[,i] +
    runif(nrow(segMSPCArndr@data),
          -sqrt(segMSPCArndr@data[,i+3])/2,
          sqrt(segMSPCArndr@data[,i+3])/2)
}

check rows with nsamples>0 are not identical

which(nsamples>10)
##  [1]  16327  24059  24271  26500  31108  38121  40811  42718  45791  58615
## [11]  79340  91427 103327 117386 128443 156531 165480 170241 183168 186100
## [21] 200983 211158 223458 238074
#select 1
segMSPCA@data$label[16327]
## [1] 260262
segMSPCArndr@data[segMSPCArndr@data$label==260262,1:5]
##           label nbPixels  meanPC1  meanPC2  meanPC3
## 16326    260262    16868 196.5344 62.17091 161.9491
## 16326.1  260262    16868 194.0083 58.94373 150.7413
## 16326.2  260262    16868 197.4447 63.64964 151.1167
## 16326.3  260262    16868 195.2878 61.83187 159.4040
## 16326.4  260262    16868 195.4166 62.67403 164.4191
## 16326.5  260262    16868 198.4160 60.56191 150.3830
## 16326.6  260262    16868 199.2306 62.64557 153.7078
## 16326.7  260262    16868 198.0056 59.40253 159.4608
## 16326.8  260262    16868 200.4795 63.99562 160.8465
## 16326.9  260262    16868 192.4961 58.31090 162.3158
## 16326.10 260262    16868 198.8244 58.17159 160.2379
#plot(segMSPCArndr@data[,3:4],type="p",pch=20,col=alpha("black",0.10))
#plot(segMSPCArndr)
#save(segMSPCArndr,file="segMSPCArndr.rda")
#load("segMSPCArndr.rda")

3. Clustering

#make sure require(fastcluster): really much faster!
segMSPCArndrhc <- hclust(dist(segMSPCArndr@data[,3:5]),method="ward.D2")

plot(2:nrow(segMSPCArndr)+1,rev(segMSPCArndrhc$height))

plot of chunk unnamed-chunk-6

plot(2:11,rev(segMSPCArndrhc$height)[1:10])

plot of chunk unnamed-chunk-6

hclass7r <- cutree(segMSPCArndrhc,k=7)
colorCodes7 = c("red","green","blue","cyan","yellow","pink","orange")
plot(segMSPCArndrhc,hang=-1, labels=FALSE, xlab="",sub="")
rect.hclust(segMSPCArndrhc, k=7,cluster = hclass7r,border =colorCodes7)
legend("topright",legend=paste0("Cluster_",1:7),cex=0.75,lty=1,
       col=colorCodes7)

plot of chunk unnamed-chunk-6

4. Plot in PCA plane

colorines7 <- mapvalues(hclass7r,from=unique(hclass7r),to=colorCodes7)
plot(segMSPCArndr@data[,3:4],type="n",xlab="PC1",ylab="PC2")
text(segMSPCArndr@data[,3:4],label=hclass7r,cex=0.75,col=alpha(colorines7,0.75))

plot of chunk unnamed-chunk-7

5. Save results

segMSPCArndr@data$hclass7r <- hclass7r
writeOGR(segMSPCArndr,dsn="segMSPCArndr",layer="segMSPCArndr",
         driver="ESRI Shapefile", overwrite=TRUE)
save(segMSPCArnd,file="segMSPCArndr.rda")