<html><head><meta name="color-scheme" content="light dark"></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">use strict;

if(@ARGV == 0){
die &lt;&lt;USAGE
usage:
perl extract_data.pl &lt;chromDir&gt; &lt;bedfile&gt; &lt;chromcol&gt; &lt;poscol&gt; &lt;widthcol&gt; &lt;statcol&gt; &lt;outfile&gt;
    
    &lt;chromDir&gt;: the directory containing the chromosome FastAs (as 
                typically downloaded from UCSC as "chromFa"): one FastA per
                chromosome, one sequence per file
    &lt;bedfile&gt;:  the file containing the peaks in tabular format, 
                e.g., bed, gff, narrowPeak
    &lt;chromcol&gt;: the column of &lt;bedfile&gt; containing the chromosome
    &lt;poscol&gt;:   the column of &lt;bedfile&gt; containing the position relative to
                the chromosome start
    &lt;widthcol&gt;: either i) &lt;int&gt; the column of &lt;bedfile&gt; containing the width of 
                          the peak or any region to be extracted 
                          symmetrically around &lt;poscol&gt;
                or    ii) f&lt;int&gt; a fixed width of all regions, centered at the
                          position given in &lt;poscol&gt;, e.g., f100 for a fixed
                          width of 100 bp
    &lt;statcol&gt;:  the column of &lt;bedfile&gt; containing the peak statistic
                or a similar measure of confidence
    &lt;outfile&gt;:  the path to the output file, written as FastA
USAGE
}


my $chrom = $ARGV[0];
my $bed = $ARGV[1];
my $chromcol = $ARGV[2]-1;
my $poscol = $ARGV[3]-1;
my $widthcol = $ARGV[4];
my $statcol = $ARGV[5]-1;
my $outfile = $ARGV[6];

my $sort = 1;

my $width = 0;

if($widthcol =~ /f([0-9]+)/i){
	$width = $1;
}else{
	$widthcol --;
}


sub loadSeq{
	my $prefix = shift;
	print $prefix," ";
	open(FA,$chrom."/".$prefix.".fa");
	my $head = "";
	my @lines = ();
	while(&lt;FA&gt;){
		chomp();
		if(/^&gt;/){
			if($head){
				die "Chromosome FastA contained more than one sequence for ".$prefix."\n";
			}
			$head = $_;
		}else{
			push(@lines,lc($_));
		}
	}
	my $str = join("",@lines);
	print "loaded\n";
	return $str;
}



open(IN,$ARGV[1]);

my @lines = ();

while(&lt;IN&gt;){
	chomp();
	my @parts = split("\t",$_);
	$parts[$chromcol] =~ s/chr0/chr/g;
	my @vals = ($parts[$chromcol],$parts[$poscol],$parts[$statcol]);
	if($width){
		push(@vals,$width);
	}else{
		push(@vals,$parts[$widthcol]);
	}
	push(@lines,\@vals);
}

close(IN);
print "Read input file ".$bed."\n";


if($sort){

	@lines = sort { ${$a}[0] cmp ${$b}[0]  } @lines;

}

open(OUT,"&gt;".$outfile);

print "Extracting sequences...\n\n";

my $oldchr = "";
my $sequence = "";
for my $line (@lines){
	my @ar = @{$line};
	my $chr = $ar[0];
	unless($chr eq $oldchr){
		$sequence = loadSeq($chr);
	}
	$oldchr = $chr;
	my $w = $ar[3];
	if($w &lt;= 0){
		print $w," -&gt; next\n";
		next;
	}
	if($w % 2 == 0){
		$w = $w/2;
	}else{
		$w = ($w-1)/2;
	}

	my $start = $ar[1]-$w-1;

	my $head = "&gt; chr: ".$chr."; start: ".$start."; peak: ".($ar[1]-$start)."; signal: ".$ar[2]."\n";
	my $curr = substr($sequence,$start,$ar[3]);
	if($curr =~ /[^ACGTacgt]/){
		print "Sequence for\n\t",substr($head,1),"omitted due to ambiguous nucleotides.\n\n";
	}else{
		print OUT $head,$curr,"\n";
	}
}

close(OUT);
print "\nDone.\n";</pre></body></html>