#!/usr/bin/env python

# From BLAST XML to HTML. By Sebastian Bassi.
# Tested with BLASTN xml files from 2.2.16 to 2.2.18.
# BLASTN xml files < 2.2.16 are not properly formatted.

import sys
import getopt
import os

from Bio.Blast import NCBIXML

def htmlhead(fr,oid,f_in,odir,rec):
    oid = oid.upper()
    if oid=='T':
        fo=open(odir+os.sep+rec.query_id+'.html','w')
    else:
        fo=open(odir+os.sep+rec.query+'.html','w')
    fo.write('''<HTML>
<TITLE>BLAST Search Results</TITLE>
<BODY BGCOLOR="#FFFFFF" LINK="#0000FF" \
VLINK="#660099" ALINK="#660099">
<!-- Generated from %s by XML2HTML (Sebastian Bassi) -->
<PRE>''' %(f_in))
    fo.write('<b>%s %s %s</b>' %(fr.application,fr.version,
                                 fr.date))
    fo.write('\n<b><a href="http://www.ncbi.nlm.nih.gov/entrez/\
query.fcgi?db=PubMed&cmd=Retrieve&list_uids=9254694&dopt=\
Citation">Reference</a>:</b>'+fr.reference.replace('~',' ')
+'\n')
    return fo

def htmlfoot(fo,fr):
    try:
        nldb = fr.num_letters_in_database
    except:
        nldb = fr._num_letters_in_database
    fo.write('''<PRE>
  Database: %s
  Number of letters in database: %s
  Number of sequences in database:  %s

Lambda     K      H
    %.2f    %.3f     %.2f

Matrix: %s matrix:%s %s
Gap Penalties: Existence: %s, Extension: %s
Number of Sequences: %s
Length of database: %s
</PRE>
</BODY>
</HTML>''' %(fr.database,fr.num_letters_in_database,
             fr.num_sequences_in_database,
             fr.ka_params[0],fr.ka_params[1],
             fr.ka_params[2],fr.application,
             fr.sc_match,fr.sc_mismatch,
             fr.gap_penalties[0],fr.gap_penalties[1],
             fr.num_sequences_in_database, nldb))
    fo.close()
    return None

def prettyalign(fo,q,qs,qe,m,s,ss,se):
    """ Format the alignment in slices of 60 characters
    """
    #fo=file handler
    #q query sequence
    #qs query_start (or query_from)
    #qe query_end (or query_to)
    #m match sequence
    #s, ss and se are the equivalent for subject/hit
    pos = 0
    qr=range(qs,qe-1,-1) if qs>qe else range(qs,qe+61,1) 
    qini=qs
    qend=qe
    sr=range(ss,se-1,-1) if ss>se else range(ss,ss+len(s),1)
    mx=max(len(str(qr[-1])),len(str(sr[-1])))
    q_desp=0
    s_desp=0
    if max(len(q),len(s))>=60:
        finant_u = (pos+1 if ss>se else pos-1)
        finant_d = (pos+1 if ss>se else pos-1)
        while pos<max(len(q)-(len(q)%60),len(s)-(len(s)%60)):
            q_desp += (q[pos:pos+60].count('-')
                       if '-' in q[pos:pos+60] else 0)
            s_desp += (s[pos:pos+60].count('-')
                       if '-' in s[pos:pos+60] else 0)
            fo.write('Query: %-*s %s %s\n'%
                     (mx,qr[finant_u-1 if ss>se else finant_u+1],
                      q[pos:pos+60],qr[pos+59-q_desp]))
            fo.write('       '+' '*mx+' '+m[pos:pos+60]+'\n')
            fo.write('Sbjct: %-*s %s %s\n\n'%
                     (mx,sr[finant_d-1 if ss>se else finant_d+1],
                      s[pos:pos+60],sr[pos+59-s_desp]))
            finant_u = pos+59-q_desp
            finant_d = pos+59-s_desp
            pos += 60
        if len(q)%60!=0:
            q_desp+=(q[pos:pos+60].count('-') if
                     '-' in q[pos:pos+60] else 0)
            s_desp+=(s[pos:pos+60].count('-') if
                     '-' in s[pos:pos+60] else 0)
            fo.write('Query: %-*s %s %s\n'%(mx,qr[pos-q_desp],
                     q[pos:pos+60],qend))
            fo.write('       '+' '*mx+' '+m[pos:pos+60]+'\n')
            fo.write('Sbjct: %-*s %s %s\n\n'%(mx,sr[pos-s_desp],
                     s[pos:pos+60],sr[-1]))
    else:
        fo.write('Query: %-*s %s %s\n'%(mx,qini,
                 q[pos:pos+60],qend))
        fo.write('       '+' '*mx+' '+m[pos:pos+60]+'\n')
        fo.write('Sbjct: %-*s %s %s\n\n'%(mx,sr[pos],
                 s[pos:pos+60],sr[-1]))
    return None    

def blastconv(rec,fo,odir,de=None,al=None,oid='T'):
    """ Get a blast record in XML en saves the same
        record in HTML
    """
    fo.write('\n<b>Query=</b> %s' %(rec.query))
    fo.write('\n         (%s letters)' %(rec.query_letters))
    fo.write('\n<b>Database:</b> %s' %(rec.database))
    try:
        fo.write('\n         %s sequences; %s total letters\n'
        %(fr.num_sequences_in_database,
          fr.num_letters_in_database))
    except:
        # For Biopython > 1.45
        fo.write('\n         %s sequences; %s total letters\n'
        %(fr.num_sequences_in_database,
          fr._num_letters_in_database))
    fo.write('''Searching................................done

<PRE>


''')
    fo.write(' '*65+'Score    E\n')
    fo.write('Sequences producing significant alignments:'+\
             ' '*22+'(bits) Value\n')
    for d in rec.descriptions[:de]:
        k=d.accession
        desc=d.title
        bs=d.bits
        sc=d.e
        if 'gi|' in k:
            m=k.index('gi|')+3
            gi=k[m:k[m:].index('|')+3]
            fo.write('<a href="http://www.ncbi.nlm.nih.gov\
/entrez/query.fcgi?cmd=Retrieve&db=Nucleotide&list_uids\
=%s&dopt=GenBank" >%s</a> %s <a href = #%s>%.1f</a>%s%s\n'
%(gi,k.replace('gi|'+gi+'|',''),desc,k,bs,4*" ",sc))
        else:
            fo.write('%s ... <a href = #%s>%.1f</a>%s%s\n'
            %(desc[:60],k,bs,4*" ",sc))
    fo.write('</PRE>\n')
    for alig in rec.alignments[:al]:
        fo.write('<PRE>\n')
        k=alig.hit_id
        desc=alig.hit_def
        if 'gi|' in k:
            m=k.index('gi|')+3
            gi=k[m:k[m:].index('|')+3]
            fo.write('><a name=%s></a><a href="http://\
www.ncbi.nlm.nih.gov/entrez/query.fcgi?\
cmd=Retrieve&db=Nucleotide&list_uids=%s&dopt=GenBank" >\
%s</a> %s \n'
%(alig.accession,gi,k.replace('gi|'+gi+'|',''),desc))
        else:
            fo.write('><a name=%s></a>%s'%(alig.accession,desc))
        fo.write('\n Length = %s\n' %(alig.length))
        # Walk over all the hsps
        for hsp in alig.hsps:
            bs=hsp.bits
            hsc=hsp.score
            sc=hsp.expect
            h_id=hsp.identities
            h_pos=hsp.positives
            h_alen=hsp.align_length
            q_frame=hsp.frame[0]
            try:
                h_frame=hsp.frame[1]
            except IndexError:
                h_frame=q_frame
            q_from=hsp.query_start
            q_to=hsp.query_end
            h_from=hsp.sbjct_start
            h_to=hsp.sbjct_end
            qseq=hsp.query
            hseq=hsp.sbjct
            mid=hsp.match
            qf='Plus' if q_frame>0 else 'Minus'
            hf='Plus' if h_frame>0 else 'Minus'
            fo.write('\n\nScore = %s bits (%s), Expect = %s\n'
                     %(bs,hsc,sc))
            fo.write('Identities = %s/%s (%.0f%%)\n'
                %(h_id,h_alen,float(int(h_id))/int(h_alen)*100))
            fo.write('Strand = %s/%s\n\n' %(qf,hf))
            prettyalign(fo,qseq,q_from,q_to,mid,hseq,h_from,h_to)
        fo.write('</PRE>\n')
    return fo

def program_help():
    print 'XML2HTML converts a BLAST XML file into one',
    print 'multiple HTML files.'
    print 'This version requires Biopython 1.45 with',
    print 'CSV fixes or better.'
    print 'Usage: ./XML2HTML -i xmlfile -o dir',
    print '[-v desc] [-b alig] [-t id]'
    print '''
XML2HTML arguments:

Required:

-i Input file [string]

Optional:

-o Output directory [string]
-t Use the sequence title as output filename [T or F, default=T]
-v Number of sequences to show one-line descriptions [integer]
-b Number of sequence to show alignments [integer]
-V Verbose mode

Example:

./XML2HTML -i seqs.xml -o /home/user/NT -v 50 -b 10

Author: Sebastian Bassi (sbassi@genesdigitales.com)
Thanks to Yoan Jacquemin for help in testing.
License: GPL 3.0 (http://www.gnu.org/licenses/gpl-3.0.txt)'''
    return None

try:
    opts, args = getopt.getopt(sys.argv[1:],
    "hVi:o:v:b:t:", ['help','input=', 'output=',
    'descriptions=','alignments=', 'title='])
except:
    print 'Bad or missing option. Please see the help:'
    program_help()
    sys.exit(2)
desc_n = None
alig_n = None
o_title = 'T'
verb = 'F'
for opt,arg in opts:
    if opt in ('-h', '--help'):
        program_help()
        sys.exit()
    elif opt in ('-i','--input'):
        f_in=arg
    elif opt in ('-o','--output'):
        o_dir=arg
    elif opt in ('-t','--title'):
        o_title=arg
    elif opt in ('-v','--descriptions'):
        desc_n=int(arg)
    elif opt in ('-b','--alignments'):
        alig_n=int(arg)
    elif opt in ('-V','--verbose'):
        verb='T'
if len(sys.argv)>2 and '-i' in sys.argv:
    if 'o_dir' not in dir():
        o_dir=f_in.rpartition(os.sep)[0]
    fr=NCBIXML.parse(open(f_in)).next()
    # fr is the first record, where 'Parameters' are stored. 
    for b_rec in NCBIXML.parse(open(f_in)):
        f_out = htmlhead(fr,f_in,o_title,o_dir,b_rec)
        f_out = blastconv(b_rec,f_out,o_dir,desc_n,alig_n,
                          o_title)
        htmlfoot(f_out,fr)
        if verb=='T':
            print (b_rec.query_id if o_title=='T' else \
                   b_rec.query)+'.html'
else:
    program_help()
    sys.exit(2)

This code is part of the book "Python for Bioinformatics", by Sebastian Bassi (sbassi@genesdigitales.com). Return to home page.