args <- commandArgs(trailingOnly = TRUE)
intab <- args[1]
outfile <- args[2]
cnames <- args[3]
k <- args[4]
data=read.table(intab, sep="\t", row.names=1)
pdf(outfile)
cn=strsplit(cnames, ',')
library(gplots)
d=as.matrix(sapply(data, as.numeric))
#d=round(log2(d + 1.1), digits=2)
colnames(d)=unlist(cn)
#only label rows if there are only a few
labels=FALSE
if (nrow(d)<50) {
   labels=rownames(data)
}

# gaggleUtil.R: general purpose (primarily convenience) R functions developed for microarray 
# data in the gaggle
##------------------------------------------------------------------------------------------------
normalize <- function (m)
# return a matrix which in which each row is normalized
{
  result <- m
  for (r in 1:dim (m)[1]) {
    avg <- mean (m [r,], na.rm=TRUE)       
    variance <- var (m [r,], na.rm=TRUE)    
    result [r,] <- result [r,] - avg             #center
    result [r,] <- result [r,] / sqrt (variance) #scale
    }

  invisible (result)

}
#row normalize
nd = normalize(d)

#determine k to use for kmeans???
# Determine number of clusters from (http://www.statmethods.net/advstats/cluster.html)
wss <- (nrow(nd)-1)*sum(apply(nd,2,var))
for (i in 2:15) wss[i] <- sum(kmeans(nd, centers=i, iter.max=20)$withinss)
plot(1:15, wss, type="b", xlab="Number of Clusters", ylab="Within groups sum of squares")
#look for bend in plot
# K-Means Cluster Analysis
w = kmeans(nd, k, iter.max = 20)
or = order(w$cluster)
clus=list()
#clusn=list()
for (i in 1:max(w$cluster)) {
   c=(length(w$cluster[w$cluster<=i]))
   #n=(length(w$cluster[w$cluster==i]))
   #p=(length(w$cluster[w$cluster<i]))
   #m=p+floor(n/2)
   clus=c(clus, c)
   #clusn=c(clusn, m) 
}
#plot
#set breaks for colors
s=quantile(nd, c(.05))
e=quantile(nd, c(.95))
cnt=101
step = (e - s)/cnt
breaks = c(seq(s,e,by=step))
dim(breaks)
breaks[1]=min(nd)
breaks[cnt+1]=max(nd)
#num=1:max(w$cluster)
#default is row scale (row standardized?) can't do this and breaks???
heatmap.2(nd[or,], dendrogram="none", trace="none", col=colorRampPalette(c("green","lightyellow","red")), labRow=labels, Colv=FALSE,Rowv=FALSE, breaks=breaks, rowsep=clus, sepcolor="black", sepwidth=c(.1,.1))
#adding cluster labels doesn't line up
#add.expr=mtext(num, side=4, at=clusn, las=1)
dev.off()

#print table 
colnames(data)=unlist(cn)
pd=cbind(data, w$cluster)
pd=pd[or,]
write.table(pd, file="geneClusters.txt", sep="\t")
q()