#!/usr/bin/env python

# From BLAST XML to HTML. By Sebastian Bassi.
# Tested with BLASTN xml files from 2.2.16 to 2.2.18.
# BLASTN xml files < 2.2.16 are not properly formatted.

import os
from optparse import OptionParser

from Bio.Blast import NCBIXML

helpstr = '''XML2HTML converts a BLAST XML file into one or 
multiple HTML files. This version requires Biopython 1.45 
with CSV fixes or higher.

Author: Sebastian Bassi (sbassi@genesdigitales.com)
Thanks to Yoan Jacquemin for help in testing.
License: GPL 3.0 (http://www.gnu.org/licenses/gpl-3.0.txt)'''
usage = helpstr + '\n\nusage: %prog input_file [options]'
parser = OptionParser()
parser = OptionParser(usage=usage)
parser.add_option("-o", "--output", dest="o_dir", default='.',
                  help="name of the output directory")
parser.add_option("-v", '--descriptions', dest="desc_n", 
                  default=None, type="int", 
                  help="descriptions to keep in output file")
parser.add_option("-b", '--alignments', dest="align_n",
                  default=None, type="int", 
                  help="alignments keep in output file")
parser.add_option("-V", '--verbose', dest="verb",
                  action="store_true", default=False,
                  help="prints output filename(s)")
parser.add_option("-t", '--title', dest="title",
                  action="store_true", default=False,
                  help="use sequence title as filename")

def htmlhead(fr,oid,f_in,odir,rec):
    if oid is True:
        fo=open(os.path.join(odir,rec.query_id+'.html'),'w')
    else:
        fo=open(os.path.join(odir,rec.query+'.html'),'w')
    fo.write('''<HTML>
<TITLE>BLAST Search Results</TITLE>
<BODY BGCOLOR="#FFFFFF" LINK="#0000FF" \
VLINK="#660099" ALINK="#660099">
<!-- Generated from %s by XML2HTML (Sebastian Bassi) -->
<PRE>''' %(f_in))
    fo.write('<b>%s %s %s</b>' %(fr.application,fr.version,
                                 fr.date))
    fo.write('\n<b><a href="http://www.ncbi.nlm.nih.gov/entrez/\
query.fcgi?db=PubMed&cmd=Retrieve&list_uids=9254694&dopt=\
Citation">Reference</a>:</b>'+fr.reference.replace('~',' ')
+'\n')
    return fo

def htmlfoot(fo,fr):
    try:
        nldb = fr.num_letters_in_database
    except:
        nldb = fr._num_letters_in_database
    fo.write('''<PRE>
  Database: %s
  Number of letters in database: %s
  Number of sequences in database:  %s

Lambda     K      H
    %.2f    %.3f     %.2f

Matrix: %s matrix:%s %s
Gap Penalties: Existence: %s, Extension: %s
Number of Sequences: %s
Length of database: %s
</PRE>
</BODY>
</HTML>''' %(fr.database,fr.num_letters_in_database,
             fr.num_sequences_in_database,
             fr.ka_params[0],fr.ka_params[1],
             fr.ka_params[2],fr.application,
             fr.sc_match,fr.sc_mismatch,
             fr.gap_penalties[0],fr.gap_penalties[1],
             fr.num_sequences_in_database, nldb))
    fo.close()
    return None

def prettyalign(fo,q,qs,qe,m,s,ss,se):
    """ Format the alignment in slices of 60 characters
    """
    #fo=file handler
    #q query sequence
    #qs query_start (or query_from)
    #qe query_end (or query_to)
    #m match sequence
    #s, ss and se are the equivalent for subject/hit
    pos = 0
    qr=range(qs,qe-1,-1) if qs>qe else range(qs,qe+61,1) 
    qini=qs
    qend=qe
    sr=range(ss,se-1,-1) if ss>se else range(ss,ss+len(s),1)
    mx=max(len(str(qr[-1])),len(str(sr[-1])))
    q_desp=0
    s_desp=0
    if max(len(q),len(s))>=60:
        finant_u = (pos+1 if ss>se else pos-1)
        finant_d = (pos+1 if ss>se else pos-1)
        while pos<max(len(q)-(len(q)%60),len(s)-(len(s)%60)):
            q_desp += (q[pos:pos+60].count('-')
                       if '-' in q[pos:pos+60] else 0)
            s_desp += (s[pos:pos+60].count('-')
                       if '-' in s[pos:pos+60] else 0)
            fo.write('Query: %-*s %s %s\n'%
                     (mx,qr[finant_u-1 if ss>se else finant_u+1],
                      q[pos:pos+60],qr[pos+59-q_desp]))
            fo.write('       '+' '*mx+' '+m[pos:pos+60]+'\n')
            fo.write('Sbjct: %-*s %s %s\n\n'%
                     (mx,sr[finant_d-1 if ss>se else finant_d+1],
                      s[pos:pos+60],sr[pos+59-s_desp]))
            finant_u = pos+59-q_desp
            finant_d = pos+59-s_desp
            pos += 60
        if len(q)%60!=0:
            q_desp+=(q[pos:pos+60].count('-') if
                     '-' in q[pos:pos+60] else 0)
            s_desp+=(s[pos:pos+60].count('-') if
                     '-' in s[pos:pos+60] else 0)
            fo.write('Query: %-*s %s %s\n'%(mx,qr[pos-q_desp],
                     q[pos:pos+60],qend))
            fo.write('       '+' '*mx+' '+m[pos:pos+60]+'\n')
            fo.write('Sbjct: %-*s %s %s\n\n'%(mx,sr[pos-s_desp],
                     s[pos:pos+60],sr[-1]))
    else:
        fo.write('Query: %-*s %s %s\n'%(mx,qini,
                 q[pos:pos+60],qend))
        fo.write('       '+' '*mx+' '+m[pos:pos+60]+'\n')
        fo.write('Sbjct: %-*s %s %s\n\n'%(mx,sr[pos],
                 s[pos:pos+60],sr[-1]))
    return None    

def blastconv(rec,fo,odir,de=None,al=None,oid='T'):
    """ Get a blast record in XML en saves the same
        record in HTML
    """
    fo.write('\n<b>Query=</b> %s' %(rec.query))
    fo.write('\n         (%s letters)' %(rec.query_letters))
    fo.write('\n<b>Database:</b> %s' %(rec.database))
    try:
        fo.write('\n         %s sequences; %s total letters\n'
        %(fr.num_sequences_in_database,
          fr.num_letters_in_database))
    except:
        # For Biopython > 1.45
        fo.write('\n         %s sequences; %s total letters\n'
        %(fr.num_sequences_in_database,
          fr._num_letters_in_database))
    fo.write('''Searching................................done

<PRE>


''')
    fo.write(' '*65+'Score    E\n')
    fo.write('Sequences producing significant alignments:'+\
             ' '*22+'(bits) Value\n')
    for d in rec.descriptions[:de]:
        k=d.accession
        desc=d.title
        bs=d.bits
        sc=d.e
        if 'gi|' in k:
            m=k.index('gi|')+3
            gi=k[m:k[m:].index('|')+3]
            fo.write('<a href="http://www.ncbi.nlm.nih.gov\
/entrez/query.fcgi?cmd=Retrieve&db=Nucleotide&list_uids\
=%s&dopt=GenBank" >%s</a> %s <a href = #%s>%.1f</a>%s%s\n'
%(gi,k.replace('gi|'+gi+'|',''),desc,k,bs,4*" ",sc))
        else:
            fo.write('%s ... <a href = #%s>%.1f</a>%s%s\n'
            %(desc[:60],k,bs,4*" ",sc))
    fo.write('</PRE>\n')
    for alig in rec.alignments[:al]:
        fo.write('<PRE>\n')
        k=alig.hit_id
        desc=alig.hit_def
        if 'gi|' in k:
            m=k.index('gi|')+3
            gi=k[m:k[m:].index('|')+3]
            fo.write('><a name=%s></a><a href="http://\
www.ncbi.nlm.nih.gov/entrez/query.fcgi?\
cmd=Retrieve&db=Nucleotide&list_uids=%s&dopt=GenBank" >\
%s</a> %s \n'
%(alig.accession,gi,k.replace('gi|'+gi+'|',''),desc))
        else:
            fo.write('><a name=%s></a>%s'%(alig.accession,desc))
        fo.write('\n Length = %s\n' %(alig.length))
        # Walk over all the hsps
        for hsp in alig.hsps:
            bs=hsp.bits
            hsc=hsp.score
            sc=hsp.expect
            h_id=hsp.identities
            h_pos=hsp.positives
            h_alen=hsp.align_length
            q_frame=hsp.frame[0]
            try:
                h_frame=hsp.frame[1]
            except IndexError:
                h_frame=q_frame
            q_from=hsp.query_start
            q_to=hsp.query_end
            h_from=hsp.sbjct_start
            h_to=hsp.sbjct_end
            qseq=hsp.query
            hseq=hsp.sbjct
            mid=hsp.match
            qf='Plus' if q_frame>0 else 'Minus'
            hf='Plus' if h_frame>0 else 'Minus'
            fo.write('\n\nScore = %s bits (%s), Expect = %s\n'
                     %(bs,hsc,sc))
            fo.write('Identities = %s/%s (%.0f%%)\n'
                %(h_id,h_alen,float(int(h_id))/int(h_alen)*100))
            fo.write('Strand = %s/%s\n\n' %(qf,hf))
            prettyalign(fo,qseq,q_from,q_to,mid,hseq,h_from,h_to)
        fo.write('</PRE>\n')
    return fo

(opts, args) = parser.parse_args()
if len(args)<1:
    errmsg = "Bad or missing option in input."
    errmsg += " This program requires an input file"
    errmsg += " Please see the help with -h or --help"
    parser.error(errmsg)
else:
    title = opts.title
    desc = opts.desc_n
    align = opts.align_n
    for f in args:
        if opts.o_dir=='.':
            o_dir = ""
        else:
            o_dir = opts.o_dir
        fr = NCBIXML.parse(open(f)).next()
        for rec in NCBIXML.parse(open(f)):
            f_out = htmlhead(fr,title,f,o_dir,rec)
            f_out = blastconv(rec,f_out,o_dir,desc,align,title)
            htmlfoot(f_out,fr)
            if opts.verb is True:
                print(rec.query_id if title else rec.query+
                      '.html')