################################################################################
## Project PuMaQC - Downloading (part 1/2 of Import)
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## This script downloads raw data (gzipped CEL files) from GEO and unzips them.
## Input:
##   ini.file      - INI file name (absolute, or relative to the current
##                   working directory)
##   src.path      - folder with PuMaQC scripts (default - online);
##                   accepted for interface compatibility, not used here
##   repeat.ungzip - if TRUE, unpack the archives again even when the
##                   unpacked file already exists
## Output: number of arrays whose unpacked CEL file has non-zero size
## Side effects:
##   - the working directory is changed to the folder of the INI file and is
##     NOT restored on exit
##   - files are written to / removed from the Import/DownloadTo folder
##   - the search-results table is rewritten with added PuMaQC.* columns
##
## (c)GNU GPL P.V.Nazarov, J.P.Corte-Real, updated 2011-12-16
################################################################################
#GSM653194
PuMaQC.Download <- function(ini.file,
                            src.path = "http://sablab.net/PuMaQC",
                            repeat.ungzip = FALSE) {

  ## ---- local helpers --------------------------------------------------------

  ## Remove regular files of `folder` whose size satisfies `is.bad(size)`.
  ## Files with unknown (NA) size and sub-folders are left alone.
  remove.by.size <- function(folder, is.bad) {
    paths <- file.path(folder, dir(folder))
    if (length(paths) == 0) return(invisible(0L))
    info <- file.info(paths)
    bad <- which(!info$isdir & is.bad(info$size))
    if (length(bad) > 0) file.remove(paths[bad])
    invisible(length(bad))
  }

  ## GSM identifiers for which a "<gsm>.CEL.gz" archive is present in `folder`.
  list.ready <- function(folder) {
    sub("[.][cC][eE][lL][.]gz$", "", dir(folder, pattern = "[.]gz$"))
  }

  ## Extract "<name>.CEL.gz" (extension kept in its original letter case)
  ## from a supplementary-file entry; NA when no CEL archive is mentioned.
  cel.file.name <- function(supp) {
    if (length(supp) != 1 || is.na(supp)) return(NA_character_)
    hit <- regexpr("[.]cel[.]gz", supp, ignore.case = TRUE)
    if (hit < 0) return(NA_character_)
    ext <- regmatches(supp, hit)
    ## text before the extension, with any leading path / URL removed
    stem <- sub(".+/", "", substr(supp, 1, hit - 1))
    paste0(stem, ext)
  }

  ## ---- configuration --------------------------------------------------------

  ## some necessary functions
  ## NOTE(review): this executes code fetched over plain HTTP on every call;
  ## consider shipping parseINI locally or pinning a trusted copy.
  source("http://sablab.net/scripts/parseINI.r")

  ## Make the INI path absolute BEFORE changing directory, so that a relative
  ## path still resolves afterwards; then work in the folder of the INI file.
  ini.file <- normalizePath(ini.file, winslash = "/", mustWork = FALSE)
  if (!file.exists(ini.file)) {
    stop("Cannot find INI file: ", ini.file)
  }
  setwd(dirname(ini.file))

  Info <- parseINI(ini.file, correct.bslash = TRUE)

  ## Get search results (fall back to the default file name)
  if (is.null(Info$Search$Results)) Info$Search$Results <- ""
  if (!file.exists(Info$Search$Results)) Info$Search$Results <- "selectedGSM.txt"
  if (!file.exists(Info$Search$Results)) {
    stop("Cannot find file with selected data! Repeat search.")
  }
  results.file <- Info$Search$Results

  ## Reading results of previous operation
  Meta <- read.table(file = results.file, sep = "\t", header = TRUE,
                     as.is = TRUE, quote = "\"", comment.char = "")
  missing.cols <- setdiff(c("gsm", "supplementary_file"), names(Meta))
  if (length(missing.cols) > 0) {
    stop("Columns missing in ", results.file, ": ",
         paste(missing.cols, collapse = ", "))
  }
  n.arrays <- nrow(Meta)
  if (n.arrays == 0) {
    cat("No arrays are listed in ", results.file, ". Nothing to download.\n")
    return(0L)
  }

  ##############################################################################
  ## Download selected data
  ##############################################################################

  ## Prepare folder for downloading (strip trailing spaces and slash)
  if (length(Info$Import$DownloadTo) != 1) {
    stop("Import/DownloadTo is not defined in the INI file!")
  }
  download.dir <- sub("/$", "", sub(" +$", "", Info$Import$DownloadTo))
  Info$Import$DownloadTo <- download.dir

  ## Create folder to download (if possible); dir.create is portable, so no
  ## OS-specific shell call is needed.
  if (!file.exists(download.dir)) {
    cat("Folder to store downloaded data is not found. Trying to create...\n")
    dir.create(download.dir, recursive = TRUE, showWarnings = FALSE)
    if (!file.exists(download.dir)) {
      stop("Cannot create folder to store dowloaded data!\n")
    }
  }

  ## Since the download of big datasets can be interrupted we try to download
  ## several times
  ## ntry - number of tries
  ntry <- 3
  for (itry in seq_len(ntry)) {
    cat("Downloading. Try #", itry, " of ", ntry, "\n")
    flush.console()

    ## kill files with size less than 100kB (treated as broken downloads)
    remove.by.size(download.dir, function(size) size < 100 * 1024)

    ## check ready files
    gsms <- list.ready(download.dir)

    attempted <- 0  # download attempts on this try (controls the retry loop)
    counter <- 0    # successful downloads on this try (reported to the user)
    for (i in seq_len(n.arrays)) {
      if (Meta$gsm[i] %in% gsms) {
        cat("#", i, ": do nothing for ", Meta$gsm[i], "(already downloaded)\n")
        flush.console()
        next
      }

      ## standard FTP download (getGEOSuppFiles) does not work, therefore we
      ## use manual download via HTTP, e.g.
      ## www.ncbi.nlm.nih.gov/geosuppl/?acc=GSM175977&file=GSM175977%2ECEL%2Egz
      ## (the older ".../projects/geo/query/acc.cgi?mode=raw&acc=..." form is
      ## no longer used)
      remote.name <- cel.file.name(Meta$supplementary_file[i])
      if (is.na(remote.name)) {
        cat("#", i, ": no CEL.gz supplementary file is listed for ",
            Meta$gsm[i], " (skipped)\n")
        flush.console()
        next
      }
      link <- paste0("http://www.ncbi.nlm.nih.gov/geosuppl/?acc=",
                     Meta$gsm[i], "&file=", remote.name)
      dest <- file.path(download.dir,
                        paste(Meta$gsm[i], "CEL", "gz", sep = "."))

      cat("#", i, ": downloading ", Meta$gsm[i], "\n")
      attempted <- attempted + 1
      ## A failed download must not abort the whole run - the next try (or
      ## the next call) picks up whatever is still missing.
      ok <- tryCatch(
        download.file(url = link, destfile = dest, mode = "wb") == 0,
        error = function(e) {
          cat("   download failed: ", conditionMessage(e), "\n")
          FALSE
        }
      )
      if (isTRUE(ok)) counter <- counter + 1
      flush.console()
    }
    cat(counter, " arrays have been downloaded on try ", itry, "\n")
    flush.console()

    ## delete empty files
    remove.by.size(download.dir, function(size) size == 0)

    ## check ready files: stop when everything is there, or when this try had
    ## nothing left that could be attempted
    gsms <- list.ready(download.dir)
    if (all(Meta$gsm %in% gsms)) break
    if (attempted == 0) break
  }

  ##############################################################################
  ## Unpack the files and remove 0Mb size
  ##############################################################################
  cat("\nUnpacking.\n")

  ## Drop PuMaQC.* columns left by a previous run, then rebuild them
  old.cols <- grep("PuMaQC", names(Meta))
  if (length(old.cols) > 0) {
    Meta <- Meta[, -old.cols, drop = FALSE]
  }
  Meta$PuMaQC.file.gz <- file.path(download.dir,
                                   paste(Meta$gsm, "CEL", "gz", sep = "."))
  Meta$PuMaQC.file <- file.path(download.dir,
                                paste(Meta$gsm, "CEL", sep = "."))
  Meta$PuMaQC.size.gz <- file.info(Meta$PuMaQC.file.gz)$size
  Meta$PuMaQC.size <- NA
  Meta$PuMaQC.code <- NA

  ## library() fails loudly when R.utils is missing (require() only warns)
  library(R.utils)

  for (i in seq_len(n.arrays)) {
    gz.file <- Meta$PuMaQC.file.gz[i]
    cel.file <- Meta$PuMaQC.file[i]

    if (!file.exists(gz.file)) {
      cat(sprintf("#%d[%d]: %s file not found!\n", i, n.arrays, gz.file))
      Meta$PuMaQC.size.gz[i] <- NA
      Meta$PuMaQC.size[i] <- NA
      flush.console()
      next
    }

    if (!file.exists(cel.file) || repeat.ungzip) {
      unpacked <- tryCatch({
        gunzip(gz.file, overwrite = TRUE, remove = FALSE)
        TRUE
      }, error = function(e) {
        cat(sprintf("#%d[%d]: %s cannot be un-GZipped: %s\n",
                    i, n.arrays, gz.file, conditionMessage(e)))
        FALSE
      })
      if (unpacked) {
        ## Measure the unpacked file itself: the value returned by gunzip()
        ## is not a size in current R.utils versions.
        Meta$PuMaQC.size[i] <- file.info(cel.file)$size
        cat(sprintf("#%d[%d]: %s (of %d B) is un-GZipped (to %d B)\n",
                    i, n.arrays, gz.file,
                    Meta$PuMaQC.size.gz[i], Meta$PuMaQC.size[i]))
      }
    } else {
      unpacked <- TRUE
      Meta$PuMaQC.size[i] <- file.info(cel.file)$size
      cat(sprintf(
        "#%d[%d]: %s (of %d B) has been already un-GZipped (to %d B)\n",
        i, n.arrays, gz.file,
        Meta$PuMaQC.size.gz[i], Meta$PuMaQC.size[i]))
    }

    ## Store the first 2-byte integer of the unpacked file as its code;
    ## left NA when the file is missing or too short to read.
    if (unpacked && file.exists(cel.file)) {
      code <- readBin(cel.file, what = "integer", size = 2)
      if (length(code) == 1) Meta$PuMaQC.code[i] <- code
    }
    flush.console()
  }

  write.table(Meta, file = results.file, sep = "\t",
              row.names = FALSE, col.names = TRUE, quote = TRUE)

  sum(Meta$PuMaQC.size > 0, na.rm = TRUE)
}