Chop genome to mimic 454.pl

From EdwardsLab

Jump to: navigation, search


#
# Copyright (c) 2003-2008 Robert Edwards
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program If not it is available at http://www.theseed.org/LICENSE.TXT.
#


use strict;
use lib '/home/redwards/MIG/Modules';
use Rob;
my $rob=new Rob;
use FIG;
my $fig=new FIG;


my ($distf, $outputsz, $genome, $fastafile)=(undef, "30,000,000", undef, undef);
while (@ARGV)
{
	my $t=shift;
	if ($t eq "-d") {$distf=shift}
	elsif ($t eq "-o") {$outputsz = shift}
	elsif ($t eq "-g") {$genome=shift}
	elsif ($t eq "-f") {$fastafile=shift}
}

unless ($genome || $fastafile)
{
	print <<EOF;
Mimic a 454 run by chopping up a genome into shorter fragments with random starting positions.
If we have a distribution file, we can use that to choose the distribution of the sequences, else we will just make random number between 100-110 bp.

The distribution file should just have length of fragment and number of occurences

The output file size should be in bp, default is 30,000,000 mimic a full run of a 454 read

Either a seed genome id or a fasta file can be specified as the target DNA

-d <distribution file>
-o <output size (bp), default $outputsz> (you can include commas)
-g genome number to attack
-f fasta file

EOF
exit(-1);
}

$outputsz =~ s/\,//g;

my %dist;
if ($distf)
{
	open(IN, $distf) || die "Can't open $distf";
	my $tot; my %tempdist;
	while (<IN>)
	{
		chomp;
		my ($sz, $c)=split /\t/;
		$tempdist{$sz}=$c;
		$tot+=$c;
	}
# convert the distribution to a range of fractions
	my $last=0;
	map {
		$dist{$_}=[$last, $last+$tempdist{$_}/$tot];
		$last=$dist{$_}->[1];
	} sort {$a <=> $b} keys %tempdist;
}
else
{
	my @sizes=(100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110);
	my $last=0;
	foreach my $x (@sizes)
	{
		$dist{$x}=[$last, $last+(1/scalar(@sizes))];
		$last=$dist{$x}->[1];
	}
}

my $contigs;
my $totallen;
if ($genome)
{
	$contigs=$rob->read_fasta($FIG_Config::organisms."/$genome/contigs");
	map {$totallen+=length($_)} values %$contigs;
	print STDERR "Read ", scalar(keys %$contigs), " contigs for $genome, with $totallen bp\n";
}
elsif ($fastafile) 
{
	$contigs=$rob->read_fasta($fastafile);
	map {$totallen+=length($_)} values %$contigs;
	print STDERR "Read ", scalar(keys %$contigs), " contigs from $fastafile, with $totallen bp\n";
}
else {die "Either genome or fasta file must be specified"}

# how long is the sequence, and what fraction is each contig
my %contigdist;
{
	my $last=0;
	map {
		$contigdist{$_}=[$last, $last+(length($contigs->{$_})/$totallen)];
		$last=$contigdist{$_}->[1];
	} keys %$contigs;
}

# now we have everything we need, we need to (1) randomly choose a contig, (2) randomly choose a start site, (3) randomly choose a length
my $totalbpoutput=0;
my $seqcount=0;
while ($totalbpoutput < $outputsz)
{
# randomly choose a contig
	my $rand=rand();
	my $contig;
	foreach my $c (keys %contigdist)
	{
		if ($contigdist{$c}->[0] < $rand && $contigdist{$c}->[1] >= $rand) {$contig=$c; last}
	}
	
	unless ($contig)
	{
		map {print STDERR join ("\t", $_, $contigdist{$_}->[0], $contigdist{$_}->[1]), "\n"} keys %contigdist;
		die "Can't get a contig for $rand"
	}

# randomly choose a start site
	my $startsite=int(rand(length($contigs->{$contig})));

# randomly choose a distribution
	$rand=rand();
	my $len=0;
	foreach my $d (keys %dist)
	{
		if ($dist{$d}->[0] < $rand && $dist{$d}->[1] >= $rand) {$len=$d; last}
	}
	die "Can't get a length for $rand" unless ($len);
	#print STDERR "Choose $len because $rand and ", $dist{$len}->[0], "\t", $dist{$len}->[1], "\n";
	my $seq=substr($contigs->{$contig}, $startsite, $len);
	$seqcount++;
	$contig =~ s/\s.*$//;
	print ">$seqcount [$contig $startsite $len]\n$seq\n";
	$totalbpoutput+=$len;
}


Personal tools
peoples pages