#!/usr/bin/python

import cgi, cgitb
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Blast import NCBIXML, NCBIStandalone
from tempfile import NamedTemporaryFile

blast_exe = '/var/www/blast-2.2.18/bin/blastall'

cgitb.enable()
mask = "N" # Mask character
form = cgi.FieldStorage()
# Get sequence data from text area
seqs = form.getvalue("seqs")
# Check if the textarea is empty
if not seqs:
    # Since the textarea is empty, check the uploaded file
    seqs = form.getvalue("seqdatafile")

blast_db = form.getvalue("blastdb",'customdb')
if blast_db == 'customdb':
    db = '/var/www/blast/db/lauravect'
elif blast_db == 'ncbivector':
    db = '/var/www/blast/db/vector'
elif blast_db == 'plantmito':
    db = '/var/www/blast/db/plantmitogenomes'
else:
    # In case someone sends an unexpected string to 
    # the script, it defaults to the custom vector DB
    db = '/var/www/blast/db/lauravect'

def create_rel(XMLin):
    """ Create a dictionary that relate the sequence name 
    with the region to mask """
    bat1 = {}
    b_records = NCBIXML.parse(XMLin)
    for b_record in b_records:
        for alin in b_record.alignments:
            for hsp in alin.hsps:
                qs = hsp.query_start
                qe = hsp.query_end
                if qs>qe:
                    qe,qs=qs,qe
                if b_record.query not in bat1:
                    bat1[b_record.query] = [(qs,qe)]
                else:
                    bat1[b_record.query].append((qs,qe))
    return bat1

def maskseqs(ffh,bat1):
    """ Take a FASTA file and apply the mask using the 
        positions in the dictionary"""
    outseqs = []
    for record in SeqIO.parse(ffh, "fasta"):
        if record.id in bat1:
            # Generate a mutable sequence object to store
            # the sequence with the "mask".
            mutable_seq = record.seq.tomutable()
            coords = bat1[record.id]
            for x in coords:
                mutable_seq[x[0]:x[1]] = mask*(x[1]-x[0])
            seq_rec = SeqRecord(mutable_seq,record.id,'','')
            outseqs.append(seq_rec)
        else:
            # Leave the sequence as found when its name is 
            # not in the dictionary.
            outseqs.append(record)
    return outseqs

# Create a temporary file
fasta_in_fh = NamedTemporaryFile()
# Write the user entered sequence into this temporary file
fasta_in_fh.write(seqs)
# Flush the data to disk without closing and deleting the file,
# since that closing a temporary file also deletes it
fasta_in_fh.flush()
# Get the name of the temporary file
file_in = fasta_in_fh.name
# Run the BLAST query        
rh, eh = NCBIStandalone.blastall(blast_exe, "blastn", db, 
                                 file_in, expectation='1e-6')
# Create contamination position and store it in a dictionary
bat1 = create_rel(rh)
# Reset the pointer position to the begining of the file
fasta_in_fh.seek(0)
# Get the sequences masked
newseqs = maskseqs(fasta_in_fh,bat1)
# Close and delete the temporary file
fasta_in_fh.close()
# Creates a new temporary file
fasta_out_fh = NamedTemporaryFile()
# Write the masked sequence into this temporary file
SeqIO.write(newseqs,fasta_out_fh,'fasta')
# Reset the pointer position to the begining of the file
fasta_out_fh.seek(0)
# Read the file
finalout = fasta_out_fh.read()
# Close and delete the temporary file
fasta_out_fh.close()

print 'Content-type: text/html\n'
print """<html><head><title>Vector Filter Output</title></head>
<body>Filtered sequences:<br/><p></p><pre>%s</pre>
</body></html>"""%(finalout)

This code is part of the book "Python for Bioinformatics", by Sebastian Bassi (sbassi@genesdigitales.com). Return to home page.