################################################################################ ## Project PuMaQC - Quality Assesment / Quality Control ## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## This script Pperforms Quality Control with identification of low quality arrays ## Input: ## ini.file - INI file name with full path ## src.path - folder with PuMaQC scripts (default - online) ## Output: ## Summary Repport with Quality Metrics plots ## Table in text format with expression data for good quality arrays ## ## (c)GNU GPL P.V.Nazarov, J.P.Corte-Real, 2011 ################################################################################ PuMaQC.QC = function(ini.file, src.path = "http://sablab.net/PuMaQC") { source("http://sablab.net/scripts/parseINI.r") source("http://sablab.net/scripts/plotDataPDF.r") source("http://sablab.net/scripts/drawTable.r") source("http://sablab.net/scripts/xCorrMatrix.r") source("http://sablab.net/scripts/plotProfile.r") setwd(sub("/[^/]+$","",ini.file)) Info = parseINI(ini.file, correct.bslash=T) ## Get results of the search if (is.null(Info$Import$Results)) Info$Import$Results="" if (!file.exists(Info$Import$Results)) Info$SeImportarch$Results="importResults.txt" if (!file.exists(Info$Import$Results)) stop("Cannot find file with imported data! Repeat search/import.") Meta = read.table(file=ifelse(length(Info$Import$Results)>0,Info$Import$Results,"importResults.txt"), sep="\t",header=T,as.is=T,quote="\"",comment.char="") ##============================================================================== ## Load data generated by ATP ##============================================================================== Data0 = read.table("apt/pm-only.median.summary.txt",header=T,sep="\t",as.is=T) names(Data0) = sub(".[c|C][e|E][l|L]$","",names(Data0)) if (max(Data0[,-1]) > 100) Data0[,-1] = log2(Data0[,-1]) cat("Not-normalized data are loaded.\n") Data = read.table("apt/rma-sketch.summary.txt",header=T,sep="\t",as.is=T) names(Data) = sub(".[c|C][e|E][l|L]$","",names(Data)) if (max(Data[,-1]) > 100) Data = log2(Data) cat("Normalized data are loaded.\n") ## correct for possible swappings in order of GSMs in Meta and Data assignElements = function(x,vec) which(x==vec) assignElements(Meta$gsm[1],names(Data)[-1]) idx = as.integer(lapply(Meta$gsm,assignElements,names(Data)[-1])) Meta = Meta[idx,] Rep0 = read.table("apt/pm-only.median.report.txt",sep="\t",header=T,as.is=T) Rep = read.table("apt/rma-sketch.report.txt",sep="\t",header=T,as.is=T) ##============================================================================== ## Visualization: first page ##============================================================================== pdf("PuMaQC-3_QC.pdf",width=8.3, height=11.7,onefile=TRUE) palL = colorRampPalette(c("lightpink","palegoldenrod","palegreen","paleturquoise","lightskyblue","mediumpurple1")) palD = colorRampPalette(c("darkred","gold4","darkgreen","turquoise4","darkblue","purple4")) plot.new() title("\nPuMaQC Report 3. Quality Control/Quality Assessments") txt = paste( "Files imported at PREPROCESSING session are ready for quality control.") text(0,1,txt,adj=c(0,1)) tmp=c("Files imported:","Series imported:","Number of features:") tmp=data.frame(matrix(nc=1,nrow=length(tmp))) row.names(tmp)=c("Files imported:","Series imported:","Number of features:") names(tmp)="" tmp[1,1] = nrow(Meta) tmp[2,1] = nlevels(as.factor(Meta$series_id)) tmp[3,1] = nrow(Data) drawTable(tmp,dx=0.27,dy=0.025) if (length(Info$QC$arrayQualityMetrics)>0) if (Info$QC$arrayQualityMetrics %in% c("T","1","TRUE")) text(0,0.89,"Check /arrayQM/ folder for QC analysis by arrayQualityMetrics package.",adj=c(0,1)) # show Meta tmp = data.frame(Meta$gsm, Meta$series_id, Meta$submission_date, Meta$organism_ch1, Meta$source_name_ch1, Meta$characteristics_ch1, stringsAsFactors=F) names(tmp) = c("GSM","Series","Date","Organism","Name","Characteristics") drawTable(tmp,y0=0.85,dx=c(0.05,0.1,0.1,0.1,0.1,0.25,1),dy=0.01,cex=0.5,bg="white") ################################################################################ ## Visualization: distributions ################################################################################ par(mfcol=c(2,2)) boxplot(Data0[,-1], col=rainbow(ncol(Data0)), outline=F,las=2,cex.axis=0.6,cex.main=1,ylab="Log2 expression", main="Not-normalized Data",notch=T) plotDataPDF(Data0,main="",col=rainbow(ncol(Data0)-1)) boxplot(Data[,-1], col=rainbow(ncol(Data)), outline=F,las=2,cex.axis=0.6,cex.main=1,ylab="\nLog2 expression", main="Normalized Data",notch=T) plotDataPDF(Data,main="",col=rainbow(ncol(Data)-1)) ################################################################################ ## Heatmap ################################################################################ XCorr = xCorrMatrix(Data[,-1],names=names(Data[,-1])) col.series=palD(nlevels(as.factor(Meta$series_id))) heatmap(XCorr, #col=colorRampPalette(c("blue","green","yellow","red"))(1000), col=colorRampPalette(c("darkblue","gold"))(1000), #ColSideColors=rainbow(nlevels(as.factor(Meta$series_id)))[as.factor(Meta$series_id)], ColSideColors=col.series[as.factor(Meta$series_id)], RowSideColors=rainbow(ncol(Data)-1), scale="none",revC=T, main="Between-array Correlation\n\n", cex.main=1) par(mfcol=c(2,2),new=T) plot(NA,NA,new=T,ylim=c(0,1),xlim=c(min(XCorr,na.rm=T),max(XCorr,na.rm=T)), yaxt="n",bty="n",las=2,ylab="",xlab="correlation values",main="") minE=min(XCorr,na.rm=T) maxE=max(XCorr,na.rm=T) for(i in 1:100){ rect(minE + (maxE-minE)*(i-1)/100,-0.055, minE + (maxE-minE)*(i-0)/100,0.0, col=colorRampPalette(c("darkblue","gold"))(100)[i],xpd=NA,border = NA) } ################################################################################ ## QC (Ours) ################################################################################ if (length(Info$QC$Alpha)==0){ cat("No paramenter 'Alpha' found! It is set now to the default value 0.05") alpha=0.05 }else{ alpha=as.double(Info$QC$Alpha) if ((alpha<=0)|(alpha>1)){ cat("Wrong paramenter 'Alpha' (",alpha,")! Changing to the default 0.05") alpha=0.05 } } par(mfcol=c(2,1)) QC = data.frame(matrix(nrow=nrow(Meta),ncol=4)) names(QC)= c("cor","rle","expr","ma") QC.names=c("Mean Correlation","Mean of RLE","Mean Log2 Expression","MA-plot Bias") row.names(QC) = names(Data)[-1] QC$cor = (apply(XCorr,2,sum)-1) / (nrow(XCorr)-1) QC$rle = Rep$all_probeset_rle_mean QC$expr = apply(Data0[,-1],2,"mean") ## MA M = as.matrix(Data[,-1]) A = M expr.mean = apply(M,1,mean) getMA.m=function(x,ref) {x-ref} getMA.a=function(x,ref) {(x+ref)/2} M = apply(M,2,"getMA.m",expr.mean) A = apply(A,2,"getMA.a",expr.mean) for (i in 1:ncol(M)){ tmp = lowess(rank(A[,i]),M[,i]) #lowess(A[,i],M[,i]) QC$ma[i]=sqrt(sum(tmp$y^2)) } print(xCorrMatrix(QC,names=names(QC))) QC.lim = as.data.frame(matrix(ncol=ncol(QC),nr=2)) names(QC.lim)=names(QC) row.names(QC.lim)=c("min","max") QC.lim$cor[1] = qnorm(alpha)*sd(QC$cor) + mean(QC$cor) QC.lim$cor[2] = +Inf QC.lim$rle[1] = -Inf QC.lim$rle[2] = qnorm(1-alpha)*sd(QC$rle) + mean(QC$rle) QC.lim$expr[1] = qnorm(alpha)*sd(QC$expr) + mean(QC$expr) QC.lim$expr[2] = Inf QC.lim$ma[1] = -Inf QC.lim$ma[2] = qnorm(1-alpha)*sd(QC$ma) + mean(QC$ma) for (i in 1:ncol(QC)){ plotProfile(QC[,i],main=QC.names[i],col=rainbow(ncol(Data)-1), names.arg=Meta$gsm,cex.names=0.6, cex.main=1) # barplot(QC[,i],main=QC.names[i],col=rainbow(ncol(Data)-1), # names.arg=Meta$gsm,cex.names=0.6, cex.main=1,las=2, # ylim=c(0,max(QC[,i])*1.2)) if (is.finite(QC.lim[1,i])){ ## if limint on min value abline(h=seq(QC.lim[1,i],min(QC[,i])-(max(QC[,i])-min(QC[,i]))*0.1, length.out=10),col="grey",lty=2) abline(h=QC.lim[1,i],col="red",lwd=2) }else{ abline(h=seq(QC.lim[2,i],max(QC[,i])+(max(QC[,i])-min(QC[,i]))*0.1, length.out=10),col="grey",lty=2) abline(h=QC.lim[2,i],col="red",lwd=2) } } par(mfcol=c(2,1)) QScore = QC for (i in 1:ncol(QC)){ if (is.finite(QC.lim[1,i])){ ## if limint on min value QScore[,i] = as.double(scale(QC[,i]))-qnorm(alpha) }else{ QScore[,i] = -as.double(scale(QC[,i]))+qnorm(1-alpha) } } ## number of QC metrics border violation color = paste(character(ncol(QC)+1),"gray",sep="") color[1] = "darkgreen" ## no violation. good quality color[2] = "yellow" color[3] = "orange" color[4] = "red" color[5] = "black" barplot(apply(QScore,1,mean),main="Array Quality Summarization", col=color[apply(QScore<0,1,sum)+1], names.arg=Meta$gsm,cex.names=0.6, cex.main=1,las=2, ylab="Average quality") legend(x="topright", legend=paste((1:(ncol(QC)+1))-1,"case"), col=color,pch=15,title="Quality violation",bg="white",cex=0.7) points(1.2*(1:nrow(QScore))-0.5,rep(0.5,nrow(QScore)),pch="*",font=2,col= as.integer(apply(QScore<0,1,sum)>=2)*2) plot.new() text(0,1,"Summary",adj=c(0,1),font=2) txt = paste("Quality control was performed on",nrow(Meta),"microarray experiments with alpha =",alpha,"using",ncol(QC),"metrics:\n", paste(QC.names,collapse=", "),".\n", "Microarray experiments meet quality limits in the following way:\n", " * 0 violation: ",sum(apply(QScore<0,1,sum) == 0),"\n", " * 1 violation: ",sum(apply(QScore<0,1,sum) == 1),"\n", " * 2 violations:",sum(apply(QScore<0,1,sum) == 2),"\n", " * 3 violations:",sum(apply(QScore<0,1,sum) == 3),"\n", " * 4 violations:",sum(apply(QScore<0,1,sum) == 4),"\n", "It is suggested to exclude",sum(apply(QScore<0,1,sum) > 1),"microarray experiments.\n", "Good quality data are saved to ",Info$QC$Results,"\n", "\n\n\n", "PuMaQC, ",date()) text(0,0.9,txt,adj=c(0,1),cex=0.8) dev.off() QCTab=data.frame(Meta$gsm, Meta$series_id, QC, QScore, apply(QScore,1,mean),apply(QScore<0,1,sum),stringsAsFactors=F) names(QCTab) = sub("[.]1$",".zscore",names(QCTab)) names(QCTab)[1]="gsm" names(QCTab)[2]="gse" names(QCTab)[ncol(QCTab)-1] = "meanQuality" names(QCTab)[ncol(QCTab)] = "violations" write.table(QCTab, file="QCQA.txt", sep="\t", row.names=F,quote=F) rm(Meta) rm(Data0) ################################################################################ ## Annotation and saving anno.file = NA if (length(Info$QC$Annotate)>0) if (toupper(as.character(Info$QC$Annotate)) %in% c("T","TRUE","1","Y","YES")){ cat("Annotate data...\n") # check whether file exists anno.file = Info$QC$Annotation if (length(anno.file)==0)anno.file="" platforms = read.table(file.path(src.path,"platforms.txt"),header=T,sep="\t",quote="\"",comment.char="",as.is=T) if (anno.file != "" & !file.exists(anno.file)) anno.file = "" if (anno.file == "") { cat("annotation file is not specified. Connecting to sablab.net/PuMaQC/lib...\n") if(Info$Search$Platform %in% platforms$Platform) { ## read platforma available at lib html=scan("http://sablab.net/PuMaQC/lib/",what="",sep="\n") html=html[grep("[.]csv[.]gz",html)] html = sub(".+href=\"","",sub("\">.+","",html)) anno.file = html[grep(paste("^",Info$Search$Platform,sep=""),html)] if (length(anno.file)>1) anno.file=anno.file[length(anno.file)] download.file(file.path("http://sablab.net/PuMaQC/lib",anno.file), method="internal",destfile = anno.file,mode="wb") require(R.utils) #gunzip(filename=anno.file, destfile = sub("[.]gz$","",anno.file),overwrite=T) gunzip(filename=anno.file, overwrite=T) anno.file = sub("[.]gz$","",anno.file) }else{ cat("ERROR: platform alias is not specified. Specify either platform alias or annotation file.\n") } } if (file.exists(anno.file)){ cat("Saving annotated data file...\n") Anno = read.csv(anno.file, quote = "\"",comment.char="#",stringsAsFactors =F) row.names(Anno)=Anno[,1] write.table(cbind(Anno[Data[,1],],Data[,c(T,apply(QScore<0,1,sum) <= 1)]), file=Info$QC$Results, row.names=F,quote=F,sep="\t") } else { anno.file = NA } } if (is.na(anno.file)){ cat("Saving data file...\n") write.table(Data[,c(T,apply(QScore<0,1,sum) <= 1)], file = Info$QC$Results, sep="\t", row.names=F,quote=F) } ################################################################################ ## Check and do arrayQualityMetrics if needed if (length(Info$QC$arrayQualityMetrics)>0) if (Info$QC$arrayQualityMetrics %in% c("T","1","TRUE")) { library(arrayQualityMetrics) eSet=new("ExpressionSet", exprs=as.matrix(Data[,-1])) arrayQualityMetrics(expressionset = eSet,outdir = "arrayQM",force = T,do.logtransform = FALSE) } }