From EdwardsLab
#
# Copyright (c) 2003-2008 Robert Edwards
#
# This file is part of the SEED Toolkit.
#
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program If not it is available at http://www.theseed.org/LICENSE.TXT.
#
use strict;
use lib '/home/redwards/MIG/Modules';
use Rob;
my $rob=new Rob;
use FIG;
my $fig=new FIG;
my ($distf, $outputsz, $genome, $fastafile)=(undef, "30,000,000", undef, undef);
while (@ARGV)
{
my $t=shift;
if ($t eq "-d") {$distf=shift}
elsif ($t eq "-o") {$outputsz = shift}
elsif ($t eq "-g") {$genome=shift}
elsif ($t eq "-f") {$fastafile=shift}
}
unless ($genome || $fastafile)
{
print <<EOF;
Mimic a 454 run by chopping up a genome into shorter fragments with random starting positions.
If we have a distribution file, we can use that to choose the distribution of the sequences, else we will just make random number between 100-110 bp.
The distribution file should just have length of fragment and number of occurences
The output file size should be in bp, default is 30,000,000 mimic a full run of a 454 read
Either a seed genome id or a fasta file can be specified as the target DNA
-d <distribution file>
-o <output size (bp), default $outputsz> (you can include commas)
-g genome number to attack
-f fasta file
EOF
exit(-1);
}
$outputsz =~ s/\,//g;
my %dist;
if ($distf)
{
open(IN, $distf) || die "Can't open $distf";
my $tot; my %tempdist;
while (<IN>)
{
chomp;
my ($sz, $c)=split /\t/;
$tempdist{$sz}=$c;
$tot+=$c;
}
# convert the distribution to a range of fractions
my $last=0;
map {
$dist{$_}=[$last, $last+$tempdist{$_}/$tot];
$last=$dist{$_}->[1];
} sort {$a <=> $b} keys %tempdist;
}
else
{
my @sizes=(100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110);
my $last=0;
foreach my $x (@sizes)
{
$dist{$x}=[$last, $last+(1/scalar(@sizes))];
$last=$dist{$x}->[1];
}
}
my $contigs;
my $totallen;
if ($genome)
{
$contigs=$rob->read_fasta($FIG_Config::organisms."/$genome/contigs");
map {$totallen+=length($_)} values %$contigs;
print STDERR "Read ", scalar(keys %$contigs), " contigs for $genome, with $totallen bp\n";
}
elsif ($fastafile)
{
$contigs=$rob->read_fasta($fastafile);
map {$totallen+=length($_)} values %$contigs;
print STDERR "Read ", scalar(keys %$contigs), " contigs from $fastafile, with $totallen bp\n";
}
else {die "Either genome or fasta file must be specified"}
# how long is the sequence, and what fraction is each contig
my %contigdist;
{
my $last=0;
map {
$contigdist{$_}=[$last, $last+(length($contigs->{$_})/$totallen)];
$last=$contigdist{$_}->[1];
} keys %$contigs;
}
# now we have everything we need, we need to (1) randomly choose a contig, (2) randomly choose a start site, (3) randomly choose a length
my $totalbpoutput=0;
my $seqcount=0;
while ($totalbpoutput < $outputsz)
{
# randomly choose a contig
my $rand=rand();
my $contig;
foreach my $c (keys %contigdist)
{
if ($contigdist{$c}->[0] < $rand && $contigdist{$c}->[1] >= $rand) {$contig=$c; last}
}
unless ($contig)
{
map {print STDERR join ("\t", $_, $contigdist{$_}->[0], $contigdist{$_}->[1]), "\n"} keys %contigdist;
die "Can't get a contig for $rand"
}
# randomly choose a start site
my $startsite=int(rand(length($contigs->{$contig})));
# randomly choose a distribution
$rand=rand();
my $len=0;
foreach my $d (keys %dist)
{
if ($dist{$d}->[0] < $rand && $dist{$d}->[1] >= $rand) {$len=$d; last}
}
die "Can't get a length for $rand" unless ($len);
#print STDERR "Choose $len because $rand and ", $dist{$len}->[0], "\t", $dist{$len}->[1], "\n";
my $seq=substr($contigs->{$contig}, $startsite, $len);
$seqcount++;
$contig =~ s/\s.*$//;
print ">$seqcount [$contig $startsite $len]\n$seq\n";
$totalbpoutput+=$len;
}