#!/usr/bin/env python

# From BLAST XML to HTML. By Sebastian Bassi.
# Tested with BLASTN xml files from 2.2.16 to 2.2.18.
# BLASTN xml files < 2.2.16 are not properly formatted.

import sys
import getopt
import os

from Bio.Blast import NCBIXML

def htmlhead(fr,oid,f_in):
    fo = open(oid,'w')
    fo.write('''<HTML>
<TITLE>BLAST Search Results</TITLE>
<BODY BGCOLOR="#FFFFFF" LINK="#0000FF" \
VLINK="#660099" ALINK="#660099">
<!-- Generated from %s by XML2HTML (Sebastian Bassi) -->
<PRE>''' %(f_in))
    fo.write('<b>%s</b>'
    %(fr.application+' '+fr.version+' '+fr.date))
    fo.write('\n<b><a href="http://www.ncbi.nlm.nih.gov/entrez/\
query.fcgi?db=PubMed&cmd=Retrieve&list_uids=9254694&dopt=\
Citation">Reference</a>:</b>'+fr.reference.replace('~',' ')
+'\n')
    return fo

def htmlfoot(fo,fr):
    try:
        nldb = fr.num_letters_in_database
    except:
        nldb = fr._num_letters_in_database
    fo.write('''<PRE>
  Database: %s
  Number of letters in database: %s
  Number of sequences in database:  %s

Lambda     K      H
    %.2f    %.3f     %.2f

Matrix: %s matrix:%s %s
Gap Penalties: Existence: %s, Extension: %s
Number of Sequences: %s
Length of database: %s
</PRE>
</BODY>
</HTML>''' %(fr.database,fr.num_letters_in_database,
             fr.num_sequences_in_database,
             fr.ka_params[0],fr.ka_params[1],
             fr.ka_params[2],fr.application,
             fr.sc_match,fr.sc_mismatch,
             fr.gap_penalties[0],fr.gap_penalties[1],
             fr.num_sequences_in_database,
             nldb))
    fo.close()
    return None

def prettyalign(fo,q,qs,qe,m,s,ss,se):
    """ Format the alignment in slices of 60 characters
    """
    #fo file handler
    #q query sequence
    #qs query_start (or query_from)
    #qe query_end (or query_to)
    #m match sequence
    #s, ss and se are the equivalent for subject/hit
    pos = 0
    qr = range(qs,qe-1,-1) if qs>qe else range(qs,qe+61,1)
    qini = qs
    qend = qe
    sr=range(ss,se-1,-1) if ss>se else range(ss,ss+len(s),1)
    mx = max(len(str(qr[-1])),len(str(sr[-1])))
    q_desp = 0
    s_desp = 0
    if max(len(q),len(s))>=60:
        finant_u = (pos+1 if ss>se else pos-1)
        finant_d = (pos+1 if ss>se else pos-1)
        while pos<max(len(q)-(len(q)%60),len(s)-(len(s)%60)):
            q_desp += (q[pos:pos+60].count('-')
                       if '-' in q[pos:pos+60] else 0)
            s_desp += (s[pos:pos+60].count('-')
                       if '-' in s[pos:pos+60] else 0)
            fo.write('Query: %-*s %s %s\n'%
                     (mx,qr[finant_u-1 if ss>se else finant_u+1],
                      q[pos:pos+60],qr[pos+59-q_desp]))
            fo.write('       '+' '*mx+' '+m[pos:pos+60]+'\n')
            fo.write('Sbjct: %-*s %s %s\n\n'%
                     (mx,sr[finant_d-1 if ss>se else finant_d+1],
                      s[pos:pos+60],sr[pos+59-s_desp]))
            finant_u = pos+59-q_desp
            finant_d = pos+59-s_desp
            pos += 60
        if len(q)%60!=0:
            q_desp += (q[pos:pos+60].count('-') if
                       '-' in q[pos:pos+60] else 0)
            s_desp += (s[pos:pos+60].count('-') if
                       '-' in s[pos:pos+60] else 0)
            fo.write('Query: %-*s %s %s\n'%(mx,qr[pos-q_desp],
                     q[pos:pos+60],qend))
            fo.write('       '+' '*mx+' '+m[pos:pos+60]+'\n')
            fo.write('Sbjct: %-*s %s %s\n\n'%(mx,sr[pos-s_desp],
                     s[pos:pos+60],sr[-1]))
    else:
        fo.write('Query: %-*s %s %s\n'%(mx,qini,
                 q[pos:pos+60],qend))
        fo.write('       '+' '*mx+' '+m[pos:pos+60]+'\n')
        fo.write('Sbjct: %-*s %s %s\n\n'%(mx,sr[pos],
                 s[pos:pos+60],sr[-1]))
    return None    

def blastconv(rec,fo,de=None,al=None):
    """ Get a blast record in XML en saves the same
        record in HTML
    """
    fo.write('\n<b>Query=</b> %s' %(rec.query))
    fo.write('\n         (%s letters)' %(rec.query_letters))
    fo.write('\n<b>Database:</b> %s' %(rec.database))
    try:
        fo.write('\n         %s sequences; %s total letters\n'
        %(fr.num_sequences_in_database,
          fr.num_letters_in_database))
    except:
        # For Biopython > 1.45
        fo.write('\n         %s sequences; %s total letters\n'
        %(fr.num_sequences_in_database,
          fr._num_letters_in_database))
    fo.write('''Searching...................................\
...done
<PRE>


                                                             \
Score    E
Sequences producing significant alignments:                  \
(bits) Value

''')
    for d in rec.descriptions[:de]:
        k = d.accession
        desc = d.title
        bs = d.bits
        sc = d.e
        if 'gi|' in k:
            m = k.index('gi|')+3
            gi = k[m:k[m:].index('|')+3]
            fo.write('<a href="http://www.ncbi.nlm.nih.gov\
/entrez/query.fcgi?cmd=Retrieve&db=Nucleotide&list_uids\
=%s&dopt=GenBank" >%s</a> %s <a href = #%s>%.1f</a>%s%s\n'
%(gi,k.replace('gi|'+gi+'|',''),desc,k,bs,4*" ",sc))
        else:
            fo.write('%s ... <a href = #%s>%.1f</a>%s%s\n'
            %(desc[:60],k,bs,4*" ",sc))
    fo.write('</PRE>\n')
    for alig in rec.alignments[:al]:
        fo.write('<PRE>\n')
        k = alig.hit_id
        desc = alig.hit_def
        if 'gi|' in k:
            m = k.index('gi|')+3
            gi = k[m:k[m:].index('|')+3]
            fo.write('><a name=%s></a><a href="http://\
www.ncbi.nlm.nih.gov/entrez/query.fcgi?\
cmd=Retrieve&db=Nucleotide&list_uids=%s&dopt=GenBank" >\
%s</a> %s \n' %(alig.accession,gi,k.replace('gi|'+gi+'|',''),
                desc))
        else:
            fo.write('><a name=%s></a>%s'%(alig.accession,desc))
        fo.write('\n Length = %s\n' %(alig.length))
        # Walk over all the hsps
        for hsp in alig.hsps:
            bs = hsp.bits
            hsc = hsp.score
            sc = hsp.expect
            h_id = hsp.identities
            h_pos = hsp.positives
            h_alen = hsp.align_length
            q_frame = hsp.frame[0]
            try:
                h_frame = hsp.frame[1]
            except IndexError:
                h_frame = q_frame
            q_from = hsp.query_start
            q_to = hsp.query_end
            h_from = hsp.sbjct_start
            h_to = hsp.sbjct_end
            qseq = hsp.query
            hseq = hsp.sbjct
            mid = hsp.match
            qf = 'Plus' if q_frame>0 else 'Minus'
            hf = 'Plus' if h_frame>0 else 'Minus'
            fo.write('\n\nScore = %s bits (%s), Expect = %s\n'
                     %(bs,hsc,sc))
            fo.write('Identities = %s/%s (%.0f%%)\n'
                %(h_id,h_alen,float(int(h_id))/int(h_alen)*100))
            fo.write('Strand = %s/%s\n\n' %(qf,hf))
            prettyalign(fo,qseq,q_from,q_to,mid,hseq,h_from,h_to)
        fo.write('</PRE>\n')
    return fo

def program_help():
    print 'XML2HTML converts a BLAST XML file into one',
    print 'multiple HTML files.'
    print 'This version requires Biopython 1.45 with',
    print 'CSV fixes or higher.'
    print 'Usage: ./XML2HTML -i xmlfile [-o dir]',
    print '[-v desc] [-b alig] [-V]'
    print '''
XML2HTML arguments:

Required:

-i Input file [string]

Optional:

-o Output filename [string]
-v Number of sequences to show one-line descriptions [integer]
-b Number of sequence to show alignments [integer]
-V Verbose mode

Example:

./XML2HTML -i seqs.xml -o seqs.html -v 50 -b 10

Author: Sebastian Bassi (sbassi@genesdigitales.com)
Thanks to Yoan Jacquemin for help in testing.
License: GPL 3.0 (http://www.gnu.org/licenses/gpl-3.0.txt)'''
    return None

try:
    opts, args = getopt.getopt(sys.argv[1:],
    "hVi:o:v:b:t:", ['help','input=', 'output=',
    'descriptions=','alignments=', 'title='])
except:
    print 'Bad or missing option. Please see the help:'
    program_help()
    sys.exit(2)
desc_n = None
alig_n = None
o_title = 'T'
verb = 'F'
for opt,arg in opts:
    if opt in ('-h', '--help'):
        program_help()
        sys.exit()
    elif opt in ('-i','--input'):
        f_in=arg
    elif opt in ('-o','--output'):
        o_file=arg
    elif opt in ('-v','--descriptions'):
        desc_n=int(arg)
    elif opt in ('-b','--alignments'):
        alig_n=int(arg)
    elif opt in ('-V','--verbose'):
        verb='T'
if len(sys.argv)>2 and '-i' in sys.argv:
    if 'o_file' not in dir():
        o_file = f_in[:-3]+'html'
    # fr is the first record, where 'Parameters' are stored.
    fr = NCBIXML.parse(open(f_in)).next()
    f_out = htmlhead(fr,o_file,f_in)
    for b_rec in NCBIXML.parse(open(f_in)):
        f_out = blastconv(b_rec,f_out,desc_n,alig_n)
    htmlfoot(f_out,fr)
    if verb=='T':
        print o_file
else:
    program_help()
    sys.exit(2) 

This code is part of the book "Python for Bioinformatics", by Sebastian Bassi (sbassi@genesdigitales.com). Return to home page.