#!/usr/bin/env Rscript
###############################################################################
# #
# Copyright (c) 2013 J. Craig Venter Institute. #
# All rights reserved. #
# #
###############################################################################
# #
# This program is free software: you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation, either version 3 of the License, or #
# (at your option) any later version. #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
# GNU General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program. If not, see . #
# #
###############################################################################
###############################################################################
library('getopt');
params=c(
"input_file", "i", 1, "character",
"output_file", "o", 1, "character",
"verb_on", "v", 0, "binary"
);
opt=getopt(spec=matrix(params, ncol=4, byrow=TRUE), debug=FALSE);
script_name=unlist(strsplit(commandArgs(FALSE)[4],"=")[1])[2];
usage = paste (
"\nUsage:\n\n", script_name,
"\n",
" -i \n",
" -o \n",
" -v \n",
"\n",
"This script will generate PDF and txt files for exponential and power law models of Core, Novel, and Pan-genome gene sizes.\n",
"\n",
"\n");
if((!length(opt$input_file)) | (!length(opt$output_file))){
cat(usage);
q(status=-1);
}
###############################################################################
InputFileName=opt$input_file;
cat("\n")
cat("Input File Name: ", InputFileName, "\n\n");
OutputFileName=opt$output_file;
cat("\n")
OutputFileNamePDF <- paste(OutputFileName, ".pdf", sep="")
OutputFileNameTXT <- paste(OutputFileName, ".txt", sep="")
cat("PDF Output File Name: ", OutputFileNamePDF, "\n");
cat("TXT Output File Name: ", OutputFileNameTXT, "\n\n");
###############################################################################
###############################################################################
pdf(OutputFileNamePDF)
sink(OutputFileNameTXT)
# attempt to get more tick marks
par (lab = c(10, 10, 7))
###Novel:Power:mean###
# only plot medians - comment out means
z <- read.delim(InputFileName, sep="\t", header=TRUE, colClasses="integer", check.names=FALSE, comment.char="", quote="")
znox<-z[,"Genome"]
min_genomes <- min(znox)
max_genomes <- max(znox)
znoy<-z[,"Novel"]
min_novel <- min(znoy)
max_novel <- max(znoy)
#y <- vector(mode="numeric", length=0)
#x <- vector(mode="integer", length=0)
ymed <- vector(mode="numeric", length=0)
xmed <- vector(mode="integer", length=0)
for (i in min_genomes:max_genomes) {
# y <- c(y, mean(z[,"Novel"][z[,"Genome"]==i]))
# x <- c(x, i)
ymed <- c(ymed, median(z[,"Novel"][z[,"Genome"]==i]))
xmed <- c(xmed, i)
}
#print("Novel Genes : Power Model: Means")
#nov_pwm_nlsfit<-nls(y~k*x^(-a), start=list(k=min_genomes*max(y),a=1),trace=TRUE)
#summary(nov_pwm_nlsfit)
#rsq.nls<-1-(sum((residuals(nov_pwm_nlsfit))^2)/sum((y-mean(y))^2))
#rsq.nls
print("Novel Genes : Power Model: Medians")
nov_pwmed_nlsfit<-nls(ymed~k*xmed^(-a), start=list(k=min_genomes*max(ymed),a=1),trace=TRUE,control = nls.control(warnOnly=T, minFactor =0))
if (!is.null(opt$verb_on))
{
summary(nov_pwmed_nlsfit)
}
rsq.nls<-1-(sum((residuals(nov_pwmed_nlsfit))^2)/sum((ymed-mean(ymed))^2))
rsq.nls
maxXlim <- 4 * max_genomes
minYlim <- min_novel
if (minYlim < 1) { #for log-log plots cannot have this less than 1
minYlim <- 1
}
maxYlim <- max_novel
plot (znoy ~ znox, xlim=c(2,maxXlim), ylim=c(minYlim,maxYlim), log="xy", col="gray", lend="square", main="Novel Genes Medians: Power Red; Exponential Blue", xlab="# of genomes", ylab="# of new genes", pch=1, cex=0.25)
points(xmed, ymed, col="purple", pch=1, cex=0.5)
#points(x, y, col="purple", pch=5, cex=0.5)
xf<-seq(from=1, to=maxXlim, by=1)
xf
#dxf <- data.frame(x=xf)
dxfmed <- data.frame(xmed=xf)
#lines(xf, predict(nov_pwm_nlsfit, dxf), col="red", lwd=1, lty="dashed")
lines(xf, predict(nov_pwmed_nlsfit, dxfmed), col="red", lwd=1)
###Novel:Exponential:mean###
#print("Novel Genes: Exponential Model: Means")
#nov_expm_nlsfit<-nls(y~a*exp(-x/b)+c, start=list(a=exp(1)*(max(y)-min(y)),b=min_genomes,c=min(y)),trace=TRUE)
#summary(nov_expm_nlsfit)
#rsq.nls<-1-(sum((residuals(nov_expm_nlsfit))^2)/sum((y-mean(y))^2))
#rsq.nls
print("Novel Genes: Exponential Model: Medians")
nov_expmed_nlsfit<-nls(ymed~a*exp(-xmed/b)+c, start=list(a=exp(1)*(max(ymed)-min(ymed)),b=min_genomes,c=min(ymed)),trace=TRUE, )
if (!is.null(opt$verb_on)){
summary(nov_expmed_nlsfit)
}
rsq.nls<-1-(sum((residuals(nov_expmed_nlsfit))^2)/sum((ymed-mean(ymed))^2))
rsq.nls
#lines(xf, predict(nov_expm_nlsfit, dxf), col="blue", lwd=1, lty="dashed")
lines(xf, predict(nov_expmed_nlsfit, dxfmed), col="blue", lwd=1)
###Core:Power:mean###
znoy<-z[,"Core"]
min_core <- min(znoy)
max_core <- max(znoy)
#y <- vector(mode="numeric", length=0)
#x <- vector(mode="integer", length=0)
ymed <- vector(mode="numeric", length=0)
xmed <- vector(mode="integer", length=0)
for (i in min_genomes:max_genomes) {
# y <- c(y, mean(z[,"Core"][z[,"Genome"]==i]))
# x <- c(x, i)
ymed <- c(ymed, median(z[,"Core"][z[,"Genome"]==i]))
xmed <- c(xmed, i)
}
#print("Core Genes: Power Model: Means")
#cor_pwm_nlsfit<-nls(y~k*x^(-a), start=list(k=max(y),a=2/(min_genomes+max_genomes)),trace=TRUE, control = nls.control(warnOnly=T, minFactor =0))
#summary(cor_pwm_nlsfit)
#rsq.nls<-1-(sum((residuals(cor_pwm_nlsfit))^2)/sum((y-mean(y))^2))
#rsq.nls
print("Core Genes: Power Model: Medians")
cor_pwmed_nlsfit<-nls(ymed~k*xmed^(-a), start=list(k=max(ymed),a=2/(min_genomes+max_genomes)),trace=TRUE, control = nls.control(warnOnly=T, minFactor =0))
if (!is.null(opt$verb_on))
{
summary(cor_pwmed_nlsfit)
}
rsq.nls<-1-(sum((residuals(cor_pwmed_nlsfit))^2)/sum((ymed-mean(ymed))^2))
rsq.nls
maxYlim <- max_core
minYlim <- min_core
if (minYlim < 1) { #for log-log plots cannot have this less than 1
minYlim <- 1
}
plot (znoy ~ znox, xlim=c(2,maxXlim), ylim=c(minYlim, maxYlim), log="xy", col="gray", lend="square", main="Core Genes Medians: Power Red; Exponential Blue", xlab="# of genomes", ylab="# of core genes", pch=1, cex=0.25)
points(xmed, ymed, col="purple", pch=1, cex=0.5)
#points(x, y, col="purple", pch=5, cex=0.5)
#lines(xf,predict(cor_pwm_nlsfit, dxf),col="red", lwd=1, lty="dashed")
lines(xf,predict(cor_pwmed_nlsfit, dxfmed),col="red", lwd=1)
###Core:Exponential:mean###
#print("Core Genes: Exponential Model: Means")
#cor_expm_nlsfit<-nls(y~a*exp(-x/b)+c, start=list(a=exp(1)*(max(y)-min(y)),b=min_genomes,c=min(y)),trace=TRUE)
#summary(cor_expm_nlsfit)
#rsq.nls<-1-(sum((residuals(cor_expm_nlsfit))^2)/sum((y-mean(y))^2))
#rsq.nls
print("Core Genes: Exponential Model: Medians")
cor_expmed_nlsfit<-nls(ymed~a*exp(-xmed/b)+c, start=list(a=exp(1)*(max(ymed)-min(ymed)),b=min_genomes,c=min(ymed)),trace=TRUE, control = nls.control(warnOnly=T, minFactor =0))
if (!is.null(opt$verb_on))
{
summary(cor_expmed_nlsfit)
}
rsq.nls<-1-(sum((residuals(cor_expmed_nlsfit))^2)/sum((ymed-mean(ymed))^2))
rsq.nls
#lines(xf,predict(cor_expm_nlsfit, dxf), col="blue", lwd=1, lty="dashed")
lines(xf,predict(cor_expmed_nlsfit, dxfmed), col="blue", lwd=1)
###Pan_genome:Power:mean###
znoy<-z[,"Pan_genome"]
min_pan <- min(znoy)
max_pan <- max(znoy)
#y <- vector(mode="numeric", length=0)
#x <- vector(mode="integer", length=0)
ymed <- vector(mode="numeric", length=0)
xmed <- vector(mode="integer", length=0)
for (i in min_genomes:max_genomes) {
# y <- c(y, mean(z[,"Pan_genome"][z[,"Genome"]==i]))
# x <- c(x, i)
ymed <- c(ymed, median(z[,"Pan_genome"][z[,"Genome"]==i]))
xmed <- c(xmed, i)
}
#print("Pan-Genome: Power Model: Means")
#pan_pwm_nlsfit<-nls(y~k*x^(a), start=list(k=max(y)/min_genomes,a=1),trace=TRUE)
#summary(pan_pwm_nlsfit)
#rsq.nls<-1-(sum((residuals(pan_pwm_nlsfit))^2)/sum((y-mean(y))^2))
#rsq.nls
print("Pan-Genome: Power Model: Medians")
pan_pwmed_nlsfit<-nls(ymed~k*xmed^(a), start=list(k=max(ymed)/min_genomes,a=1),trace=TRUE, control = nls.control(warnOnly=T, minFactor =0))
if (!is.null(opt$verb_on))
{
summary(pan_pwmed_nlsfit)
}
rsq.nls<-1-(sum((residuals(pan_pwmed_nlsfit))^2)/sum((ymed-mean(ymed))^2))
rsq.nls
maxYlim <- 1.5 * max_pan
minYlim <- min_pan
if (minYlim < 1) { #for log-log plots cannot have this less than 1
minYlim <- 1
}
plot (znoy ~ znox, xlim=c(2,maxXlim), ylim=c(minYlim, maxYlim), log="xy", col="gray", lend="square", main="Pan-Genome Medians: Power Red; Exponential Blue", xlab="# of genomes", ylab="Pan_genome size", pch=1, cex=0.25)
points(xmed, ymed, col="purple", pch=1, cex=0.5)
#points(x, y, col="purple", pch=5, cex=0.5)
#lines(xf,predict(pan_pwm_nlsfit, dxf), col="red", lwd=1, lty="dashed")
lines(xf,predict(pan_pwmed_nlsfit, dxfmed), col="red", lwd=1)
###Pan_genome:Exponential:mean###
#print("Pan-Genome: Exponential Model: Means")
#pan_expm_nlsfit<-nls(y~-a*exp(-x/b)+c, start=list(a=max(y),b=(min_genomes+max_genomes)/2,c=max(y)+min(y)),trace=TRUE)
#summary(pan_expm_nlsfit)
#rsq.nls<-1-(sum((residuals(pan_expm_nlsfit))^2)/sum((y-mean(y))^2))
#rsq.nls
print("Pan-Genome: Exponential Model: Medians")
pan_expmed_nlsfit<-nls(ymed~-a*exp(-xmed/b)+c, start=list(a=max(ymed),b=(min_genomes+max_genomes)/2,c=max(ymed)+min(ymed)),trace=TRUE, control = nls.control(warnOnly=T, minFactor =0))
if (!is.null(opt$verb_on))
{
summary(pan_expmed_nlsfit)
}
rsq.nls<-1-(sum((residuals(pan_expmed_nlsfit))^2)/sum((ymed-mean(ymed))^2))
rsq.nls
#lines(xf,predict(pan_expm_nlsfit, dxf),col="blue", lwd=1, lty="dashed")
lines(xf,predict(pan_expmed_nlsfit, dxfmed),col="blue", lwd=1)
###Unique:Power:mean###
znoy<-z[,"Unique"]
min_uniq <- min(znoy)
max_uniq <- max(znoy)
#y <- vector(mode="numeric", length=0)
#x <- vector(mode="integer", length=0)
ymed <- vector(mode="numeric", length=0)
xmed <- vector(mode="integer", length=0)
if (min_genomes < 2) {
min_genomes <- 2 # unique genes don't make much sense for only 1 genome so messes up model fitting
}
for (i in min_genomes:max_genomes) {
# y <- c(y, mean(z[,"Unique"][z[,"Genome"]==i]))
# x <- c(x, i)
ymed <- c(ymed, median(z[,"Unique"][z[,"Genome"]==i]))
xmed <- c(xmed, i)
}
#print("Unique: Power Model: Means")
#pan_pwm_nlsfit<-nls(y~k*x^(a), start=list(k=max(y)/min_genomes,a=1),trace=TRUE)
#summary(pan_pwm_nlsfit)
#rsq.nls<-1-(sum((residuals(pan_pwm_nlsfit))^2)/sum((y-mean(y))^2))
#rsq.nls
print("Unique: Power Model: Medians")
pan_pwmed_nlsfit<-nls(ymed~k*xmed^(a), start=list(k=max(ymed)/min_genomes,a=1),trace=TRUE, control = nls.control(warnOnly=T, minFactor =0))
if (!is.null(opt$verb_on))
{
summary(pan_pwmed_nlsfit)
}
rsq.nls<-1-(sum((residuals(pan_pwmed_nlsfit))^2)/sum((ymed-mean(ymed))^2))
rsq.nls
maxYlim <- 1.5 * max_uniq
minYlim <- min_uniq
if (minYlim < 1) { #for log-log plots cannot have this less than 1
minYlim <- 1
}
plot (znoy ~ znox, xlim=c(2,maxXlim), ylim=c(minYlim, maxYlim), log="xy", col="gray", lend="square", main="Unique Medians: Power Red; Exponential Blue", xlab="# of genomes", ylab="Unique size", pch=1, cex=0.25)
points(xmed, ymed, col="purple", pch=1, cex=0.5)
#points(x, y, col="purple", pch=5, cex=0.5)
xf<-seq(from=2, to=maxXlim, by=1)
xf
#dxf <- data.frame(x=xf)
dxfmed <- data.frame(xmed=xf)
#lines(xf,predict(pan_pwm_nlsfit, dxf),col="red", lwd=1, lty="dashed")
lines(xf,predict(pan_pwmed_nlsfit, dxfmed),col="red", lwd=1)
###Unique:Exponential:mean###
#print("Unique: Exponential Model: Means")
#pan_expm_nlsfit<-nls(y~-a*exp(-x/b)+c, start=list(a=max(y),b=(min_genomes+max_genomes)/2,c=max(y)+min(y)),trace=TRUE)
#summary(pan_expm_nlsfit)
#rsq.nls<-1-(sum((residuals(pan_expm_nlsfit))^2)/sum((y-mean(y))^2))
#rsq.nls
print("Unique: Exponential Model: Medians")
pan_expmed_nlsfit<-nls(ymed~-a*exp(-xmed/b)+c, start=list(a=max(ymed),b=(min_genomes+max_genomes)/2,c=max(ymed)+min(ymed)),trace=TRUE, control = nls.control(warnOnly=T, minFactor =0))
if (!is.null(opt$verb_on))
{
summary(pan_expmed_nlsfit)
}
rsq.nls<-1-(sum((residuals(pan_expmed_nlsfit))^2)/sum((ymed-mean(ymed))^2))
rsq.nls
#lines(xf,predict(pan_expm_nlsfit, dxf),col="blue", lwd=1, lty="dashed")
lines(xf,predict(pan_expmed_nlsfit, dxfmed),col="blue", lwd=1)
###Dispose:Power:mean###
znoy<-z[,"Dispose"]
min_disp <- min(znoy)
max_disp <- max(znoy)
#y <- vector(mode="numeric", length=0)
#x <- vector(mode="integer", length=0)
ymed <- vector(mode="numeric", length=0)
xmed <- vector(mode="integer", length=0)
if (min_genomes < 3) {
min_genomes <- 3 # dispensible genes are always equal to 0 bydefinition for 1 or 2 genomes so messes up model fitting
}
for (i in min_genomes:max_genomes) {
# y <- c(y, mean(z[,"Dispose"][z[,"Genome"]==i]))
# x <- c(x, i)
ymed <- c(ymed, median(z[,"Dispose"][z[,"Genome"]==i]))
xmed <- c(xmed, i)
}
#print("Dispensable: Power Model: Means")
#pan_pwm_nlsfit<-nls(y~k*x^(a), start=list(k=max(y)/min_genomes,a=1),trace=TRUE)
#summary(pan_pwm_nlsfit)
#rsq.nls<-1-(sum((residuals(pan_pwm_nlsfit))^2)/sum((y-mean(y))^2))
#rsq.nls
print("Dispensable: Power Model: Medians")
pan_pwmed_nlsfit<-nls(ymed~k*xmed^(a), start=list(k=max(ymed)/min_genomes,a=1),trace=TRUE, control = nls.control(warnOnly=T, minFactor =0))
if (!is.null(opt$verb_on))
{
summary(pan_pwmed_nlsfit)
}
rsq.nls<-1-(sum((residuals(pan_pwmed_nlsfit))^2)/sum((ymed-mean(ymed))^2))
rsq.nls
maxYlim <- 1.5 * max_disp
minYlim <- min_disp
if (minYlim < 1) { #for log-log plots cannot have this less than 1
minYlim <- 1
}
plot (znoy ~ znox, xlim=c(3,maxXlim), ylim=c(minYlim, maxYlim), log="xy", col="gray", lend="square", main="Dispensable Medians: Power Red; Exponential Blue", xlab="# of genomes", ylab="Dispensable size", pch=1, cex=0.25)
points(xmed, ymed, col="purple", pch=1, cex=0.5)
#points(x, y, col="purple", pch=5, cex=0.5)
xf<-seq(from=3, to=maxXlim, by=1)
xf
#dxf <- data.frame(x=xf)
dxfmed <- data.frame(xmed=xf)
#lines(xf,predict(pan_pwm_nlsfit, dxf),col="red", lwd=1, lty="dashed")
lines(xf,predict(pan_pwmed_nlsfit, dxfmed),col="red", lwd=1)
###Dispose:Exponential:mean###
#print("Dispensable: Exponential Model: Means")
#pan_expm_nlsfit<-nls(y~-a*exp(-x/b)+c, start=list(a=max(y),b=(min_genomes+max_genomes)/2,c=max(y)+min(y)),trace=TRUE)
#summary(pan_expm_nlsfit)
#rsq.nls<-1-(sum((residuals(pan_expm_nlsfit))^2)/sum((y-mean(y))^2))
#rsq.nls
print("Dispensable: Exponential Model: Medians")
pan_expmed_nlsfit<-nls(ymed~-a*exp(-xmed/b)+c, start=list(a=max(ymed),b=(min_genomes+max_genomes)/2,c=max(ymed)+min(ymed)),trace=TRUE, control = nls.control(warnOnly=T, minFactor =0))
if (!is.null(opt$verb_on))
{
summary(pan_expmed_nlsfit)
}
rsq.nls<-1-(sum((residuals(pan_expmed_nlsfit))^2)/sum((ymed-mean(ymed))^2))
rsq.nls
#lines(xf,predict(pan_expm_nlsfit, dxf),col="blue", lwd=1, lty="dashed")
lines(xf,predict(pan_expmed_nlsfit, dxfmed),col="blue", lwd=1)