#!/usr/bin/env Rscript
###############################################################################
# #
# Copyright (c) 2013 J. Craig Venter Institute. #
# All rights reserved. #
# #
###############################################################################
# #
# This program is free software: you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation, either version 3 of the License, or #
# (at your option) any later version. #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
# GNU General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program. If not, see . #
# #
###############################################################################
###############################################################################
#combn_sub is a function to take a combinatorial sampling of x choose m with a maximum nset of samples
#the code was found on the internet and was said to be a modification of the standard combn code - seems to work`
combn_sub <- function (x, m, nset = 1000, seed=123, simplify = TRUE, ...) {
# print("Entering combn_sub")
# print(x)
# print(m)
# print(nset)
# print(seed)
# print(simplify)
stopifnot(length(m) == 1L)
if (m < 0)
stop("m < 0", domain = NA)
if (is.numeric(x) && length(x) == 1L && x > 0 && trunc(x) ==
x)
x <- seq_len(x)
# print(x)
n <- length(x)
# print(n)
if (n < m)
stop("n < m", domain = NA)
m <- as.integer(m)
e <- 0
h <- m
a <- seq_len(m)
# print(a)
len.r <- length(r <- x[a] )
# print(len.r)
# print(r)
count <- as.integer(round(choose(n, m)))
if (is.na(count)){
count <- .Machine$integer.max
}
# print(count)
if( count < nset ) nset <- count
dim.use <- c(m, nset)
# print(dim.use)
##-----MOD 1: Change the output matrix size--------------
out <- matrix(r, nrow = len.r, ncol = nset, byrow = FALSE)
# print(out)
if (m > 0) {
if (count < (10 * nset)) {
# print("small")
i <- 1L
nmmp1 <- n - m + 1L
##----MOD 2: Select a subset of indices
set.seed(seed)
samp <- sort(sample( 1:count, nset ))
##----MOD 3: Start a counter.
counter <- 1L
while (a[1L] != nmmp1 ) {
#-----MOD 4: Whenever the counter matches an index in samp,
#a combination of row indices is produced and stored in the matrix `out`
if(samp[i] == counter){
out[, i] <- x[a]
if( i == nset ) break
i <- i + 1L
}
#iterate through all combinations
if (e < n - h) {
h <- 1L
e <- a[m]
j <- 1L
} else {
e <- a[m - h]
h <- h + 1L
j <- 1L:h
}
a[m - h + j] <- e + j
#-----Increase the counter by 1 for each iteration of the while-loop
counter <- counter + 1L
}
} else {
# print("large")
##-----MOD 5: For large count just sample twice as many as needed and remove duplicates
i <- 0L
extra <- 2L * nset
outr <- matrix(r, nrow = extra, ncol = len.r, byrow = TRUE)
# print(outr)
notDone = TRUE
while (notDone) {
while (i < extra) {
i <- i + 1L
outr[i,] <- sort(sample(x, m))
}
outr <- unique(outr)
i <- nrow(outr)
if (i >= nset) {
notDone = FALSE
outr <- outr[1:nset,]
}
}
out <- t(outr)
}
}
array(out, dim.use)
}
library("compiler")
combn_sub <- cmpfun(combn_sub)
library('getopt');
params=c(
"input_file", "i", 1, "character",
"output_file", "o", 1, "character",
"perc_core", "p", 2, "character", #percenage of genomes to be considered core
"perc_novel", "q", 2, "character", #percenage of genomes to be considered novel
"num_samples", "s", 1, "character" #maximum number of combinatorial samples to run
);
opt=getopt(spec=matrix(params, ncol=4, byrow=TRUE), debug=FALSE);
script_name=unlist(strsplit(commandArgs(FALSE)[4],"=")[1])[2];
usage = paste (
"\nUsage:\n\n", script_name,
"\n",
" -i \n",
" -o