################################################################################ ## Project PuMaQC - Preprocessing (part 2/2 of Import) ## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## This script reads downloaded files, eliminates invalid formats ## and performs preprocessing of CEL files ## Input: ## ini.file - INI file name with full path ## src.path - folder with PuMaQC scripts (default - online) ## Output: ## Summary report ## APT folder with results for not-normalized and normalized data ## ## (c)GNU GPL P.V.Nazarov, J.P.Corte-Real, 2012-02-07 ################################################################################ PuMaQC.Preprocessing = function(ini.file, src.path = "http://sablab.net/PuMaQC") { ## some necessary functions source("http://sablab.net/scripts/parseINI.r") source("http://sablab.net/scripts/drawTable.r") source("http://sablab.net/scripts/plotProfile.r") ## set default folder to the same folder as INI, if we are not there already if (length(grep("/",ini.file))>0){ setwd(sub("/[^/]+$","",ini.file)) }else{ ini.file = file.path(getwd(),ini.file) } Info = parseINI(ini.file, correct.bslash=T) ##======================================================= ## Validate APT and CDF ## check the folder for APT and CDF key.apt = Info$Import$APT while (T) { if (key.apt!=""){ res=try(system(paste("\"",file.path(key.apt,"apt-probeset-summarize\" -h"),sep=""),intern=T)) }else{ res=try(system("apt-probeset-summarize -h",intern=T)) } cat("\n\nValidating APT path\n") cat("Your APT command folder is:",key.apt,"\n") if (length(grep("^Error",res[1]))==0) { cat("...apt-probeset-summarize is tested and can be run.\n") Info$Import$APT = key.apt break; }else{ cat("...apt-probeset-summarize is not found there!\n") } cat("Enter the valid location of apt-probeset-summarize.exe:\n") key.apt = scan(n=1,what="") if (length(key.apt)==0) key.apt = "" } ## check the CDF file key.cdf = Info$Import$CDF while (T) { cat("\n\nValidating CDF\n") cat("Your CDF file is:",key.cdf,"\n") if (file.exists(key.cdf)){ tmp=try(readBin(key.cdf,what="int",n=4,size=1)) if (length(tmp)==4){ if (tmp[1]==91 && tmp[2]==67 && tmp[3]==68 && tmp[4]==70){ cat("...CDF file is found and valid.\n") Info$Import$CDF = key.cdf break; } else { cat("...CDF file is in wrong format (should start with [CDF] header)!\n") } } else{ cat("...wrong CDF file name or file is empty!\n") } } else{ cat("...CDF file is not found!\n") } cat("Enter the valid location of CDF file:\n") key.cdf = scan(n=1,what="") } ## end validation ##---------------------------------------------------- ## Get results of the search if (is.null(Info$Search$Results)) Info$Search$Results="" if (!file.exists(Info$Search$Results)) Info$Search$Results="selectedGSM.txt" if (!file.exists(Info$Search$Results)) stop("Cannot find file with selected data! Repeat search.") ## reading results of the previous operation Meta = read.table(file=Info$Search$Results,sep="\t",header=T,as.is=T,quote="\"",comment.char="") ## check whether data were downloaded if (length(grep("PuMaQC",names(Meta)))==0) stop("Downloading information is absent. Please repeat downloading") ## rescan file size and format (to avoid artifact after re-running) Meta$PuMaQC.size = file.info(Meta$PuMaQC.file)$size for (i in 1:nrow(Meta)){ Meta$PuMaQC.code[i]= readBin(Meta$PuMaQC.file[i],what="integer",size=2) } ##---------------------------------------------------------------------------- ## Define Known Affy Formats cel.formats = data.frame(matrix(nr=3,nc=2)) names(cel.formats) = c("code","descr") cel.formats[,1] = c(64,17243,315) cel.formats[,2] = c("CEL ver.4, binary XDA file generated by GCOS", "CEL ver.3, ASCII file generated by MAS", "CEL command console ver.1") ## here we filter "CEL" of wrong format ## and those which are not downloaded completely idx.bad = logical(nrow(Meta)) idx.bad [!(Meta$PuMaQC.code %in% cel.formats[,1])] = T sd.limit.up = 10 sd.limit.down = 5 # those which are deviating down more than sd.limit*sd()+1Kb will be removed # check size for each type of array for (i in 1:ncol(cel.formats)) { hdr =cel.formats[i,1] idx = Meta$PuMaQC.size[Meta$PuMaQC.code==hdr] > mean(Meta$PuMaQC.size[Meta$PuMaQC.code==hdr]) + sd.limit.up*sd(Meta$PuMaQC.size[Meta$PuMaQC.code==hdr])+1000 idx = idx | Meta$PuMaQC.size[Meta$PuMaQC.code==hdr] < mean(Meta$PuMaQC.size[Meta$PuMaQC.code==hdr]) - sd.limit.down*sd(Meta$PuMaQC.size[Meta$PuMaQC.code==hdr])-1000 if(length(idx)>0) idx.bad[Meta$PuMaQC.code==hdr][idx] = T } if (names(Meta)[ncol(Meta)]=="PuMaQC") Meta=Meta[-ncol(Meta)] Meta = cbind(Meta,as.integer(!idx.bad)) names(Meta)[ncol(Meta)]="PuMaQC" ##============================================================================== ## Report p.1 ##============================================================================== pdf("PuMaQC-2_Preprocessing.pdf",width=8.3, height=11.7,onefile=TRUE) palL = colorRampPalette(c("lightpink","palegoldenrod","palegreen","paleturquoise","lightskyblue","mediumpurple1")) palD = colorRampPalette(c("darkred","gold4","darkgreen","turquoise4","darkblue","purple4")) plot.new() title("PuMaQC Report 2. Download and Preprocessing Summary") txt = paste("Of",nrow(Meta),"files, found during database search,", sum(Meta$PuMaQC.size.gz>0),"were downloaded and unzipped.\n\n", "Consistancy analysis revealed", nlevels(as.factor(Meta$PuMaQC.code)), "formats with starting 2-byte codes: ", paste(levels(as.factor(Meta$PuMaQC.code)),collapse=", "),"\n", "Only",paste(cel.formats[,1],collapse="/"),"are recognized. In addition, file sizes were checked and outliers\n", "removed. Finally ",sum(Meta$PuMaQC)," arrays are available for preprocessing.") text(0,1,txt,adj=c(0,1)) par(fig=c(0,1,0,0.7),new=T) barplot(Meta$PuMaQC.size/2^20,las=2, main="Sizes of unpacked CEL files", cex.main=1, ylab="Size(MB)", names.arg =casefold(Meta$gsm),cex.names=0.5, col=palL(nlevels(as.factor(Meta$PuMaQC.code)))[as.integer(as.factor(Meta$PuMaQC.code))]) legend(x="topright", legend=levels(as.factor(Meta$PuMaQC.code)), col=palL(nlevels(as.factor(Meta$PuMaQC.code)))[1:nlevels(as.factor(Meta$PuMaQC.code))],pch=15,title=" 2-byte code:",cex=0.7,bg="white") ## now remove arrays with bad format Meta=Meta[Meta$PuMaQC == 1,] ##============================================================================== ## If necessary - run APT converter ##============================================================================== ## ToDo: Check for path which contains " " - can lead to problems ## if some files need to be converted if (sum(Meta$PuMaQC.code==17243)!=0) { ## generate list of files to be converted tmp=as.data.frame(Meta$PuMaQC.file[Meta$PuMaQC.code==17243]) names(tmp)="cel_files" write.table(tmp,file="apt_cel_files.txt",row.names=F,quote=F) if (Sys.info()[1] == "Windows") { ## if temporary folder does not exist - create it if (!file.exists(file.path(Info$Import$DownloadTo,"converted.tmp"))) shell(paste("md ",gsub("/","\\\\",Info$Import$DownloadTo),"\\converted.tmp",sep=""),wait=T) ## form command cmd = paste("apt-cel-convert\"", " --format xda", " --out-dir ",gsub("/","\\\\",Info$Import$DownloadTo),"\\converted.tmp", " --cel-files apt_cel_files.txt", sep="") ## run command shell(paste("\"",file.path(Info$Import$APT,cmd),sep=""),wait=T) ## move converted files shell(paste("move /Y ", gsub("/","\\\\",Info$Import$DownloadTo),"\\converted.tmp\\* ", gsub("/","\\\\",Info$Import$DownloadTo),sep=""),wait=T) ## delete tmp folder shell(paste("rd ",gsub("/","\\\\",Info$Import$DownloadTo),"\\converted.tmp",sep=""),wait=T) } if (Sys.info()[1] == "Linux") { ## if temporary folder does not exist - create it if (!file.exists(file.path(Info$Import$DownloadTo,"converted.tmp"))) system(paste("mkdir ",Info$Import$DownloadTo,"/converted.tmp",sep=""),wait=T) ## form command cmd = paste("apt-cel-convert", " --format xda", " --out-dir ",Info$Import$DownloadTo,"/converted.tmp", " --cel-files apt_cel_files.txt", sep="") ## run command if (Info$Import$APT!="") { system(paste(file.path(Info$Import$APT,cmd),sep=""),wait=T) }else{ system(cmd,wait=T) } ## move converted files system(paste("mv -f ", Info$Import$DownloadTo,"/converted.tmp/* ", Info$Import$DownloadTo,sep=""),wait=T) ## delete tmp folder system(paste("rmdir ",Info$Import$DownloadTo,"/converted.tmp",sep=""),wait=T) } } ##============================================================================== ## Preprocessing: data summarization and normalization. Produce 2 files: ## raw data: apt/pm-only.median.summary.txt ## RMA normalized data: apt/rma-sketch.summary.txt tmp=as.data.frame(Meta$PuMaQC.file) names(tmp)="cel_files" write.table(tmp,file="apt_cel_files.txt",row.names=F,quote=F) if (Sys.info()[1] == "Windows"){ cmd = paste("apt-probeset-summarize\"", " -a rma-sketch -a pm-only,median", " -d ",gsub("/","\\\\",Info$Import$CDF), " -o apt ", gsub("/","\\\\",getwd()), " --cel-files apt_cel_files.txt", " -f true", sep="") shell(paste("\"",file.path(Info$Import$APT,cmd),sep=""),wait=T) } if (Sys.info()[1] == "Linux"){ cmd = paste("apt-probeset-summarize", " -a rma-sketch -a pm-only,median", " -d ",Info$Import$CDF, " -o apt ", getwd(), " --cel-files apt_cel_files.txt", " -f true", sep="") if (Info$Import$APT!="") { system(paste(file.path(Info$Import$APT,cmd),sep=""),wait=T) }else{ system(cmd,wait=T) } } ## save results write.table(Meta, file=ifelse(length(Info$Import$Results)>0,Info$Import$Results,"importResults.txt"), sep="\t",row.names=F, col.names=T,quote=T) ##============================================================================== ## Visualization of the results ##============================================================================== Rep=list() Rep$pm = read.table("apt/pm-only.median.report.txt",sep="\t",header=T,as.is=T) Rep$rma = read.table("apt/rma-sketch.report.txt",sep="\t",header=T,as.is=T) str(Rep$pm) str(Rep$rma) tit =c("Not normalized","RMA and QQ normalization") for (ime in 1:2) { par(mfcol=c(4,1)) plotProfile(Rep[[ime]]$pm_mean, main=paste("Data Summarization: ",tit[ime],"\n"),cex.main=1.2, ylab="PM Mean", names.arg =casefold(Meta$gsm),cex.names=0.6,col=rainbow(nrow(Rep[[ime]]))) plotProfile(Rep[[ime]]$all_probeset_mean, main="All probsets Mean",cex.main=1, ylab="Mean Signal", names.arg =casefold(Meta$gsm),cex.names=0.6,col=rainbow(nrow(Rep[[ime]]))) plotProfile(Rep[[ime]]$all_probeset_stdev, main="All probsets StDev",cex.main=1, ylab="StDev Signal", names.arg =casefold(Meta$gsm),cex.names=0.6,col=rainbow(nrow(Rep[[ime]]))) plotProfile(Rep[[ime]]$all_probeset_rle_mean, main="All probsets RLE",cex.main=1, ylab="RLE", names.arg =casefold(Meta$gsm),cex.names=0.6,col=rainbow(nrow(Rep[[ime]]))) } dev.off() return(nrow(Meta)) }