#!/usr/bin/env Rscript
###############################################################################
# #
# Copyright (c) 2013 J. Craig Venter Institute. #
# All rights reserved. #
# #
###############################################################################
# #
# This program is free software: you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation, either version 3 of the License, or #
# (at your option) any later version. #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
# GNU General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program. If not, see . #
# #
###############################################################################
###############################################################################
library('getopt');
params=c(
"input_file", "i", 1, "character",
"output_file", "o", 1, "character"
);
opt=getopt(spec=matrix(params, ncol=4, byrow=TRUE), debug=FALSE);
script_name=unlist(strsplit(commandArgs(FALSE)[4],"=")[1])[2];
usage = paste (
"\nUsage:\n\n", script_name,
"\n",
" -i \n",
" -o \n",
"\n",
"This script will generate PDF and txt files for exponential and power law models of Core, Novel, and Pan-genome gene sizes.\n",
"\n",
"\n");
if((!length(opt$input_file)) | (!length(opt$output_file))){
cat(usage);
q(status=-1);
}
###############################################################################
InputFileName=opt$input_file;
cat("\n")
cat("Input File Name: ", InputFileName, "\n\n");
OutputFileName=opt$output_file;
cat("\n")
cat("Output File Name: ", OutputFileName, "\n\n");
OutputFileNamePDF <- paste(OutputFileName, ".pdf", sep="")
OutputFileNameTXT <- paste(OutputFileName, ".txt", sep="")
###############################################################################
###############################################################################
pdf(OutputFileNamePDF)
sink(OutputFileNameTXT)
###Novel:Power:mean###
print("Novel Genes: Power Model: Means")
z <- read.delim(InputFileName, sep="\t", header=TRUE, colClasses="integer", check.names=FALSE, comment.char="", quote="")
znox<-z[,"Genome"]
min_genomes <- min(znox)
max_genomes <- max(znox)
znoy<-z[,"Novel"]
zno<-data.frame(Genome=znox,Novel=znoy)
y <- vector(mode="numeric", length=0)
x <- vector(mode="integer", length=0)
for (i in min_genomes:max_genomes) {
y <- c(y, mean(z[,"Novel"][z[,"Genome"]==i]))
x <- c(x, i)
}
nov_pwm_nlsfit<-nls(y~k*x^(-a), data=z, start=list(k=min_genomes*max(y),a=1),trace=TRUE)
summary(nov_pwm_nlsfit)
rsq.nls<-1-(sum((residuals(nov_pwm_nlsfit))^2)/sum((y-mean(y))^2))
rsq.nls
maxXlim <- 1.5 * max_genomes
maxYlim <- 2 * max(y)
plot (y ~ x, xlim=c(1,maxXlim), ylim=c(1,maxYlim), col="blue", lend="square", main="Novel Genes: Power Model: Means", xlab="# of genomes", ylab="# of new genes", pch=3, cex=0.5)
xf<-seq(0,maxXlim,length=maxXlim)
lines(xf,predict(nov_pwm_nlsfit,list(x=xf)),col="red", lwd=1)
#points(zno, cex=0.3)
###Novel:Exponential:mean###
print("Novel Genes: Exponential Model: Means")
nov_expm_nlsfit<-nls(y~a*exp(-x/b)+c, data=z, start=list(a=exp(1)*(max(y)-min(y)),b=min_genomes,c=min(y)),trace=TRUE)
summary(nov_expm_nlsfit)
rsq.nls<-1-(sum((residuals(nov_expm_nlsfit))^2)/sum((y-mean(y))^2))
rsq.nls
plot (y ~ x, xlim=c(1,maxXlim), ylim=c(1,maxYlim), col="blue", lend="square", main="Novel Genes: Exponential Model: Means", xlab="# of genomes", ylab="# of new genes", pch=3, cex=0.5)
xf<-seq(0,maxXlim,length=maxXlim)
lines(xf,predict(nov_expm_nlsfit,list(x=xf)),col="red", lwd=1)
#points(zno, cex=0.3)
###Novel:Power###
print("Novel Genes: Power Model: All")
no_nlsfit<-nls(znoy~k*znox^(-a), data=z, start=list(k=min_genomes*max(y),a=1),trace=TRUE)
summary(no_nlsfit)
rsq.nls<-sum((predict(no_nlsfit)-mean(znoy))^2)/sum((znoy-mean(znoy))^2)
rsq.nls
plot (znoy ~ znox, xlim=c(1,maxXlim), ylim=c(1,maxYlim), col="blue", lend="square", main="Novel Genes: Power Model: All", xlab="# of genomes", ylab="# of new genes", pch=3, cex=0.5)
znoxf<-seq(0,maxXlim,length=maxXlim)
lines(znoxf,predict(no_nlsfit,list(znox=znoxf)),col="red", lwd=1)
#points(zno, cex=0.3)
###Novel:Exponential###
print("Novel Genes: Exponential Model: All")
no_nlsfit<-nls(znoy~a*exp(-znox/b)+c, data=z, start=list(a=exp(1)*(max(y)-min(y)),b=min_genomes,c=min(y)),trace=TRUE)
summary(no_nlsfit)
rsq.nls<-sum((predict(no_nlsfit)-mean(znoy))^2)/sum((znoy-mean(znoy))^2)
rsq.nls
plot (znoy ~ znox, xlim=c(1,maxXlim), ylim=c(1,maxYlim), col="blue", lend="square", main="Novel Genes: Exponential Model: All", xlab="# of genomes", ylab="# of new genes", pch=3, cex=0.5)
znoxf<-seq(0,maxXlim,length=maxXlim)
lines(znoxf,predict(no_nlsfit,list(znox=znoxf)),col="red", lwd=1)
#points(zno, cex=0.3)
###Core:Power:mean###
print("Core Genes: Power Model: Means")
znoy<-z[,"Core"]
zno<-data.frame(Genome=znox,Core=znoy)
y <- vector(mode="numeric", length=0)
x <- vector(mode="integer", length=0)
for (i in min_genomes:max_genomes) {
y <- c(y, mean(z[,"Core"][z[,"Genome"]==i]))
x <- c(x, i)
}
cor_pwm_nlsfit<-nls(y~k*x^(-a), data=z, start=list(k=max(y),a=2/(min_genomes+max_genomes)),trace=TRUE)
summary(cor_pwm_nlsfit)
rsq.nls<-1-(sum((residuals(cor_pwm_nlsfit))^2)/sum((y-mean(y))^2))
rsq.nls
maxXlim <- 1.5 * max_genomes
maxYlim <- 2 * max(y)
minYlim <- 0.5 * min(y)
plot (y ~ x, xlim=c(1,maxXlim), ylim=c(minYlim, maxYlim), col="blue", lend="square", main="Core Genes: Power Model: Means", xlab="# of genomes", ylab="# of core genes", pch=3, cex=0.5)
xf<-seq(0,maxXlim,length=maxXlim)
lines(xf,predict(cor_pwm_nlsfit,list(x=xf)),col="red", lwd=1)
#points(zno, cex=0.3)
###Core:Exponential:mean###
print("Core Genes: Exponential Model: Means")
cor_expm_nlsfit<-nls(y~a*exp(-x/b)+c, data=z, start=list(a=exp(1)*(max(y)-min(y)),b=min_genomes,c=min(y)),trace=TRUE)
summary(cor_expm_nlsfit)
rsq.nls<-1-(sum((residuals(cor_expm_nlsfit))^2)/sum((y-mean(y))^2))
rsq.nls
maxXlim <- 1.5 * max_genomes
maxYlim <- 2 * max(y)
minYlim <- 0.5 * min(y)
plot (y ~ x, xlim=c(1,maxXlim), ylim=c(minYlim, maxYlim), col="blue", lend="square", main="Core Genes: Exponential Model: Means", xlab="# of genomes", ylab="# of core genes", pch=3, cex=0.5)
xf<-seq(0,maxXlim,length=maxXlim)
lines(xf,predict(cor_expm_nlsfit,list(x=xf)),col="red", lwd=1)
#points(zno, cex=0.3)
##Core:Power###
print("Core Genes: Power Model: All")
cor_pw_nlsfit<-nls(znoy~k*znox^(-a), data=z, start=list(k=max(y),a=2/(min_genomes+max_genomes)),trace=TRUE)
summary(cor_pw_nlsfit)
rsq.nls<-sum((predict(cor_pw_nlsfit)-mean(znoy))^2)/sum((znoy-mean(znoy))^2)
rsq.nls
plot (znoy ~ znox, xlim=c(1,maxXlim), ylim=c(minYlim,maxYlim), col="blue", lend="square", main="Core Genes: Power Model: All", xlab="# of genomes", ylab="# of core genes", pch=3, cex=0.5)
znoxf<-seq(0,maxXlim,length=maxXlim)
lines(znoxf,predict(cor_pw_nlsfit,list(znox=znoxf)),col="red", lwd=1)
#points(zno, cex=0.3)
##Core:Exponential###
print("Core Genes: Exponential Model: All")
cor_exp_nlsfit<-nls(znoy~a*exp(-znox/b)+c, data=z, start=list(a=exp(1)*(max(y)-min(y)),b=min_genomes,c=min(y)),trace=TRUE)
summary(cor_exp_nlsfit)
rsq.nls<-sum((predict(cor_exp_nlsfit)-mean(znoy))^2)/sum((znoy-mean(znoy))^2)
rsq.nls
plot (znoy ~ znox, xlim=c(1,maxXlim), ylim=c(minYlim,maxYlim), col="blue", lend="square", main="Core Genes: Exponential Model: All", xlab="# of genomes", ylab="# of core genes", pch=3, cex=0.5)
znoxf<-seq(0,maxXlim,length=maxXlim)
lines(znoxf,predict(cor_exp_nlsfit,list(znox=znoxf)),col="red", lwd=1)
#points(zno, cex=0.3)
###Pan_genome:Power:mean###
print("Pan-Genome: Power Model: Means")
znoy<-z[,"Pan_genome"]
zno<-data.frame(Genome=znox,Pan_genome=znoy)
y <- vector(mode="numeric", length=0)
x <- vector(mode="integer", length=0)
for (i in min_genomes:max_genomes) {
y <- c(y, mean(z[,"Pan_genome"][z[,"Genome"]==i]))
x <- c(x, i)
}
pan_pwm_nlsfit<-nls(y~k*x^(a), data=z, start=list(k=max(y)/min_genomes,a=1),trace=TRUE)
summary(pan_pwm_nlsfit)
rsq.nls<-1-(sum((residuals(pan_pwm_nlsfit))^2)/sum((y-mean(y))^2))
rsq.nls
maxXlim <- 1.5 * max_genomes
maxYlim <- 2 * max(y)
minYlim <- 0.5 * min(y)
plot (y ~ x, xlim=c(1,maxXlim), ylim=c(minYlim, maxYlim), col="blue", lend="square", main="Pan-Genome: Power Model: Means", xlab="# of genomes", ylab="Pan_genome size", pch=3, cex=0.5)
xf<-seq(0,maxXlim,length=maxXlim)
lines(xf,predict(pan_pwm_nlsfit,list(x=xf)),col="red", lwd=1)
#points(zno, cex=0.3)
###Pan_genome:Exponential:mean###
print("Pan-Genome: Exponential Model: Means")
pan_expm_nlsfit<-nls(y~-a*exp(-x/b)+c, data=z, start=list(a=max(y),b=(min_genomes+max_genomes)/2,c=max(y)+min(y)),trace=TRUE)
summary(pan_expm_nlsfit)
rsq.nls<-1-(sum((residuals(pan_expm_nlsfit))^2)/sum((y-mean(y))^2))
rsq.nls
maxXlim <- 1.5 * max_genomes
maxYlim <- 2 * max(y)
minYlim <- 0.5 * min(y)
plot (y ~ x, xlim=c(1,maxXlim), ylim=c(minYlim, maxYlim), col="blue", lend="square", main="Pan-Genome: Exponential Model: Means", xlab="# of genomes", ylab="Pan_genome size", pch=3, cex=0.5)
xf<-seq(0,maxXlim,length=maxXlim)
lines(xf,predict(pan_expm_nlsfit,list(x=xf)),col="red", lwd=1)
#points(zno, cex=0.3)
##Pan_genome:Power###
print("Pan-Genome: Power Model: All")
pan_pw_nlsfit<-nls(znoy~k*znox^(a), data=z, start=list(k=max(y)/min_genomes,a=1),trace=TRUE)
summary(pan_pw_nlsfit)
rsq.nls<-sum((predict(pan_pw_nlsfit)-mean(znoy))^2)/sum((znoy-mean(znoy))^2)
rsq.nls
plot (znoy ~ znox, xlim=c(1,maxXlim), ylim=c(minYlim,maxYlim), col="blue", lend="square", main="Pan-Genome: Power Model: All", xlab="# of genomes", ylab="Pan_genome size", pch=3, cex=0.5)
znoxf<-seq(0,maxXlim,length=maxXlim)
lines(znoxf,predict(pan_pw_nlsfit,list(znox=znoxf)),col="red", lwd=1)
#points(zno, cex=0.3)
##Pan_genome:Exponential###
print("Pan-Genome: Exponential Model: All")
pan_exp_nlsfit<-nls(znoy~-a*exp(-znox/b)+c, data=z, start=list(a=max(y),b=(min_genomes+max_genomes)/2,c=max(y)+min(y)),trace=TRUE)
summary(pan_exp_nlsfit)
rsq.nls<-sum((predict(pan_exp_nlsfit)-mean(znoy))^2)/sum((znoy-mean(znoy))^2)
rsq.nls
plot (znoy ~ znox, xlim=c(1,maxXlim), ylim=c(minYlim,maxYlim), col="blue", lend="square", main="Pan-Genome: Exponential Model: All", xlab="# of genomes", ylab="Pan_genome size", pch=3, cex=0.5)
znoxf<-seq(0,maxXlim,length=maxXlim)
lines(znoxf,predict(pan_exp_nlsfit,list(znox=znoxf)),col="red", lwd=1)
#points(zno, cex=0.3)
###Dispose:Power:mean###
print("Dispensable: Power Model: Means")
znoy<-z[,"Dispose"]
zno<-data.frame(Genome=znox,Dispose=znoy)
y <- vector(mode="numeric", length=0)
x <- vector(mode="integer", length=0)
for (i in min_genomes:max_genomes) {
y <- c(y, mean(z[,"Dispose"][z[,"Genome"]==i]))
x <- c(x, i)
}
pan_pwm_nlsfit<-nls(y~k*x^(a), data=z, start=list(k=max(y)/min_genomes,a=1),trace=TRUE)
summary(pan_pwm_nlsfit)
rsq.nls<-1-(sum((residuals(pan_pwm_nlsfit))^2)/sum((y-mean(y))^2))
rsq.nls
maxXlim <- 1.5 * max_genomes
maxYlim <- 2 * max(y)
minYlim <- 0.5 * min(y)
plot (y ~ x, xlim=c(1,maxXlim), ylim=c(minYlim, maxYlim), col="blue", lend="square", main="Dispensable: Power Model: Means", xlab="# of genomes", ylab="Dispensable size", pch=3, cex=0.5)
xf<-seq(0,maxXlim,length=maxXlim)
lines(xf,predict(pan_pwm_nlsfit,list(x=xf)),col="red", lwd=1)
#points(zno, cex=0.3)
###Dispose:Exponential:mean###
print("Dispensable: Exponential Model: Means")
pan_expm_nlsfit<-nls(y~-a*exp(-x/b)+c, data=z, start=list(a=max(y),b=(min_genomes+max_genomes)/2,c=max(y)+min(y)),trace=TRUE)
summary(pan_expm_nlsfit)
rsq.nls<-1-(sum((residuals(pan_expm_nlsfit))^2)/sum((y-mean(y))^2))
rsq.nls
maxXlim <- 1.5 * max_genomes
maxYlim <- 2 * max(y)
minYlim <- 0.5 * min(y)
plot (y ~ x, xlim=c(1,maxXlim), ylim=c(minYlim, maxYlim), col="blue", lend="square", main="Dispensable: Exponential Model: Means", xlab="# of genomes", ylab="Dispensable size", pch=3, cex=0.5)
xf<-seq(0,maxXlim,length=maxXlim)
lines(xf,predict(pan_expm_nlsfit,list(x=xf)),col="red", lwd=1)
#points(zno, cex=0.3)
##Dispose:Power###
print("Dispensable: Power Model: All")
pan_pw_nlsfit<-nls(znoy~k*znox^(a), data=z, start=list(k=max(y)/min_genomes,a=1),trace=TRUE)
summary(pan_pw_nlsfit)
rsq.nls<-sum((predict(pan_pw_nlsfit)-mean(znoy))^2)/sum((znoy-mean(znoy))^2)
rsq.nls
plot (znoy ~ znox, xlim=c(1,maxXlim), ylim=c(minYlim,maxYlim), col="blue", lend="square", main="Dispensable: Power Model: All", xlab="# of genomes", ylab="Dispensable size", pch=3, cex=0.5)
znoxf<-seq(0,maxXlim,length=maxXlim)
lines(znoxf,predict(pan_pw_nlsfit,list(znox=znoxf)),col="red", lwd=1)
#points(zno, cex=0.3)
##Dispose:Exponential###
print("Dispensable: Exponential Model: All")
pan_exp_nlsfit<-nls(znoy~-a*exp(-znox/b)+c, data=z, start=list(a=max(y),b=(min_genomes+max_genomes)/2,c=max(y)+min(y)),trace=TRUE)
summary(pan_exp_nlsfit)
rsq.nls<-sum((predict(pan_exp_nlsfit)-mean(znoy))^2)/sum((znoy-mean(znoy))^2)
rsq.nls
plot (znoy ~ znox, xlim=c(1,maxXlim), ylim=c(minYlim,maxYlim), col="blue", lend="square", main="Dispensable: Exponential Model: All", xlab="# of genomes", ylab="Dispensable size", pch=3, cex=0.5)
znoxf<-seq(0,maxXlim,length=maxXlim)
lines(znoxf,predict(pan_exp_nlsfit,list(znox=znoxf)),col="red", lwd=1)
#points(zno, cex=0.3)
###Unique:Power:mean###
print("Unique: Power Model: Means")
znoy<-z[,"Unique"]
zno<-data.frame(Genome=znox,Unique=znoy)
y <- vector(mode="numeric", length=0)
x <- vector(mode="integer", length=0)
for (i in min_genomes:max_genomes) {
y <- c(y, mean(z[,"Unique"][z[,"Genome"]==i]))
x <- c(x, i)
}
pan_pwm_nlsfit<-nls(y~k*x^(a), data=z, start=list(k=max(y)/min_genomes,a=1),trace=TRUE)
summary(pan_pwm_nlsfit)
rsq.nls<-1-(sum((residuals(pan_pwm_nlsfit))^2)/sum((y-mean(y))^2))
rsq.nls
maxXlim <- 1.5 * max_genomes
maxYlim <- 2 * max(y)
minYlim <- 0.5 * min(y)
plot (y ~ x, xlim=c(1,maxXlim), ylim=c(minYlim, maxYlim), col="blue", lend="square", main="Unique: Power Model: Means", xlab="# of genomes", ylab="Unique size", pch=3, cex=0.5)
xf<-seq(0,maxXlim,length=maxXlim)
lines(xf,predict(pan_pwm_nlsfit,list(x=xf)),col="red", lwd=1)
#points(zno, cex=0.3)
###Unique:Exponential:mean###
print("Unique: Exponential Model: Means")
pan_expm_nlsfit<-nls(y~-a*exp(-x/b)+c, data=z, start=list(a=max(y),b=(min_genomes+max_genomes)/2,c=max(y)+min(y)),trace=TRUE)
summary(pan_expm_nlsfit)
rsq.nls<-1-(sum((residuals(pan_expm_nlsfit))^2)/sum((y-mean(y))^2))
rsq.nls
maxXlim <- 1.5 * max_genomes
maxYlim <- 2 * max(y)
minYlim <- 0.5 * min(y)
plot (y ~ x, xlim=c(1,maxXlim), ylim=c(minYlim, maxYlim), col="blue", lend="square", main="Unique: Exponential Model: Means", xlab="# of genomes", ylab="Unique size", pch=3, cex=0.5)
xf<-seq(0,maxXlim,length=maxXlim)
lines(xf,predict(pan_expm_nlsfit,list(x=xf)),col="red", lwd=1)
#points(zno, cex=0.3)
##Unique:Power###
print("Unique: Power Model: All")
pan_pw_nlsfit<-nls(znoy~k*znox^(a), data=z, start=list(k=max(y)/min_genomes,a=1),trace=TRUE)
summary(pan_pw_nlsfit)
rsq.nls<-sum((predict(pan_pw_nlsfit)-mean(znoy))^2)/sum((znoy-mean(znoy))^2)
rsq.nls
plot (znoy ~ znox, xlim=c(1,maxXlim), ylim=c(minYlim,maxYlim), col="blue", lend="square", main="Unique: Power Model: All", xlab="# of genomes", ylab="Unique size", pch=3, cex=0.5)
znoxf<-seq(0,maxXlim,length=maxXlim)
lines(znoxf,predict(pan_pw_nlsfit,list(znox=znoxf)),col="red", lwd=1)
#points(zno, cex=0.3)
##Unique:Exponential###
print("Unique: Exponential Model: All")
pan_exp_nlsfit<-nls(znoy~-a*exp(-znox/b)+c, data=z, start=list(a=max(y),b=(min_genomes+max_genomes)/2,c=max(y)+min(y)),trace=TRUE)
summary(pan_exp_nlsfit)
rsq.nls<-sum((predict(pan_exp_nlsfit)-mean(znoy))^2)/sum((znoy-mean(znoy))^2)
rsq.nls
plot (znoy ~ znox, xlim=c(1,maxXlim), ylim=c(minYlim,maxYlim), col="blue", lend="square", main="Unique: Exponential Model: All", xlab="# of genomes", ylab="Unique size", pch=3, cex=0.5)
znoxf<-seq(0,maxXlim,length=maxXlim)
lines(znoxf,predict(pan_exp_nlsfit,list(znox=znoxf)),col="red", lwd=1)
#points(zno, cex=0.3)