Source code for ipypublish.preprocessors.latextags_to_html

import logging
import re
import string
import io

import bibtexparser
import traitlets as traits
from nbconvert.preprocessors import Preprocessor

from six import string_types


class DefaultFormatter(string.Formatter):
    """A ``string.Formatter`` that tolerates missing named fields.

    When a named replacement field (e.g. ``{year}``) has no matching keyword
    argument, the ``default`` template is used instead of raising
    ``KeyError``.  Positional and auto-numbered fields are resolved by the
    standard ``string.Formatter`` machinery.

    Parameters
    ----------
    default : str
        template substituted for a missing named field; it is itself
        formatted with the missing key as its only positional argument, so
        ``'<{0}>'`` renders a missing ``year`` as ``<year>``

    Examples
    --------
    >>> DefaultFormatter().format("{author}, {year}.", author="Surname")
    'Surname, .'
    >>> DefaultFormatter("<{0}>").format("{0}: {year}", "positional")
    'positional: <year>'

    """

    def __init__(self, default=''):
        self.default = default

    def get_value(self, key, args, kwds):
        """Return the value for the field ``key``.

        Named keys fall back to the formatted ``default`` when absent from
        ``kwds``; any other key (an integer index) is delegated to
        ``string.Formatter.get_value``.
        """
        if isinstance(key, string_types):
            return kwds.get(key, self.default.format(key))
        # the base implementation must be called bound to this instance and
        # its result returned, otherwise positional fields cannot be resolved
        return string.Formatter.get_value(self, key, args, kwds)
def safe_str(obj):
    """Convert ``obj`` to a native ``str`` without raising on odd characters.

    Objects exposing ``decode`` (e.g. byte strings) are first decoded as
    UTF-8; bytes that are not valid UTF-8 are decoded with replacement
    characters rather than raising.  If ``str(obj)`` then fails with a
    ``UnicodeEncodeError``, en/em dashes are replaced by hyphens and any
    remaining non-ASCII characters are dropped.

    Parameters
    ----------
    obj : object
        the value to convert

    Returns
    -------
    str

    """
    if hasattr(obj, "decode"):
        try:
            obj = obj.decode("utf-8")
        except UnicodeEncodeError:
            # presumably the python 2.7 case of an object that is already
            # unicode; leave it untouched and let the fallback below cope
            pass
        except UnicodeDecodeError:
            # invalid UTF-8 input: keep what can be decoded
            obj = obj.decode("utf-8", "replace")
    try:
        return str(obj)
    except UnicodeEncodeError:  # python 2.7
        obj = re.sub(u"\u2013", "-", obj)  # en dash
        obj = re.sub(u"\u2014", "--", obj)  # em dash
        return obj.encode('ascii', 'ignore').decode('ascii')
class LatexTagsToHTML(Preprocessor):
    r"""Convert latex tags in markdown cells and captions to html.

    Finds latex tags (like \cite{abc} or \todo[color]{stuff}) and:

    1. attempts to process them into a html friendly format
    2. removes them entirely if this is not possible
       (unknown commands inside a recognised equation environment are
       left untouched, so that the math is not corrupted)

    For \ref or \cref, attempts to use resources.refmap to map labels to
    reference names. For labels not found in resources.refmap the
    reference name is '<name>. <number>', where;

    - <name> is either 'ref' or, if labelbycolon is True and the label
      has a colon, all text before the colon
    - <number> iterates by order of first appearance of a particular label

    NB: should be applied after LatexDocHTML,
    if you want resources.refmap to be available

    Examples
    --------
    >>> from nbformat import NotebookNode
    >>> from jsonextended.utils import MockPath
    >>> processor = LatexTagsToHTML()
    >>> bibfile = MockPath(is_file=True,content='''
    ... @article{bibkey,
    ... title = {the title},
    ... doi = {10.1134/S0018143916050209},
    ... author = {Surname, A. Name},
    ... date = {2016-09-01},
    ... }
    ... ''')
    >>> resources = NotebookNode({'bibliopath':bibfile, 'refmap':{"label":"label_name"}})

    >>> cell = NotebookNode({
    ... "cell_type":"markdown",
    ... "metadata":{},
    ... "source":"test"
    ... })
    >>> nb = NotebookNode({"cells":[cell]})
    >>> nb, _ = processor.preprocess(nb,resources)
    >>> print(nb.cells[0].source)
    test

    >>> cell.source = "\\unknown{test}"
    >>> nb, _ = processor.preprocess(nb,resources)
    >>> print(nb.cells[0].source)
    <BLANKLINE>

    >>> cell.source = "\\ref{label}\\unknown{test}"
    >>> nb, _ = processor.preprocess(nb,resources)
    >>> print(nb.cells[0].source)
    <a href="{id_home_prefix}label">label_name</a>

    >>> cell.source = "\\label{test}"
    >>> nb, _ = processor.preprocess(nb,resources)
    >>> print(nb.cells[0].source)
    <a id="test" class="anchor-link" name="#test">&#182;</a>

    >>> cell.source = "\\cite{bibkey}"
    >>> nb, _ = processor.preprocess(nb,resources)
    >>> print(nb.cells[0].source)
    [<a href="https://doi.org/10.1134/S0018143916050209">Surname <em>et al</em>, 2016.</a>]

    >>> cell.source = "\\begin{equation}x=a+b\\end{equation}"
    >>> nb, _ = processor.preprocess(nb,resources)
    >>> print(nb.cells[0].source)
    \begin{equation}x=a+b\end{equation}

    """

    regex = traits.Unicode(
        r"\\(?:[^a-zA-Z]|[a-zA-Z]+[*=']?)(?:\[.*?\])?{.*?}",
        help="the regex to identify latex tags").tag(config=True)
    bibformat = traits.Unicode(
        "{author}, {year}.",
        help="the format to output \\cite{} tags found in the bibliography"
    ).tag(config=True)
    labelbycolon = traits.Bool(
        True,
        help='create reference label based on text before colon, e.g. \\ref{fig:example} -> fig 1'
    ).tag(config=True)

    # environments whose \label anchors are collected and emitted separately
    _EQUATION_ENVS = ('equation', 'equation*', 'align', 'align*',
                      'multline', 'multline*', 'gather', 'gather*')
    # environments whose \begin / \end tags are always left in place
    _PASSTHROUGH_ENVS = ('split',)
    # mirrors the default ``regex`` trait, capturing the braced argument
    _TAG_ARGUMENT_RE = re.compile(
        r"\\(?:[^a-zA-Z]|[a-zA-Z]+[*=']?)(?:\[.*?\])?\{(.*?)\}$")

    def __init__(self, *args, **kwargs):
        # a dictionary to keep track of references,
        # so they each get a different number
        self.refs = {}
        # bibliography references
        self.bibdatabase = {}
        super(LatexTagsToHTML, self).__init__(*args, **kwargs)

    @staticmethod
    def read_bibliography(path):
        """Read a bibtex bibliography.

        Parameters
        ----------
        path : str or path-like object with an ``open`` method
            location of the bibtex file, read as UTF-8

        Returns
        -------
        dict
            the parsed entries keyed by citation key; an empty dict if the
            file could not be read or parsed (the failure is logged)

        """
        logging.info('reading bibliopath: {}'.format(path))
        bibdatabase = {}
        bibparser = bibtexparser.bparser.BibTexParser()
        try:
            if hasattr(path, 'open'):
                handle = path.open(encoding="utf8")
            else:
                handle = io.open(path, encoding="utf8")
            with handle as bibtex_file:
                bibtex_data = safe_str(bibtex_file.read())
            bibdatabase = bibparser.parse(bibtex_data).entries_dict
        except Exception as err:
            # deliberately best-effort: a broken bibliography must not stop
            # the conversion, citations are then reported as unresolved
            logging.error('could not read bibliopath {}: {}'.format(path, err))
        return bibdatabase

    def rreplace(self, source, target, replacement, replacements=1):
        """Replace ``target`` in ``source``, working from right to left.

        At most ``replacements`` occurrences (counted from the end of the
        string) are replaced by ``replacement``.
        """
        return replacement.join(source.rsplit(target, replacements))

    def process_bib_entry(self, entry):
        """Work out the best way to represent a bibliography entry.

        The entry is rendered with ``bibformat`` (missing fields become
        empty) and wrapped in a link built from its ``doi``, ``url`` or
        ``link`` field, in that order of preference.

        Parameters
        ----------
        entry : dict
            a bibtex entry; it is not modified

        Returns
        -------
        str
            html for the citation

        """
        # work on a copy, so the shared bibliography is not rewritten
        entry = dict(entry)

        # abbreviate a list of authors
        if 'author' in entry:
            authors = re.split(", | and ", entry['author'])
            if len(authors) > 1:
                entry['author'] = authors[0] + ' <em>et al</em>'
            else:
                entry['author'] = authors[0]

        # split up date into year, month, day
        if 'date' in entry:
            date = entry['date'].split('-')
            entry['year'] = date[0]
            if len(date) == 3:
                entry['month'] = date[1]
                entry['day'] = date[2]

        text = DefaultFormatter().format(self.bibformat, **entry)

        if 'doi' in entry:
            return r'<a href="https://doi.org/{doi}">{text}</a>'.format(
                doi=entry['doi'], text=text)
        if 'url' in entry:
            return r'<a href="{url}">{text}</a>'.format(
                url=entry['url'], text=text)
        if 'link' in entry:
            return r'<a href="{url}">{text}</a>'.format(
                url=entry['link'], text=text)
        return text

    def replace_reflabel(self, name, resources):
        """Find a suitable html replacement for a reference label.

        The links are left with a format hook in them: {id_home_prefix},
        so that an nbconvert filter can later replace it.
        This is particularly useful for slides, which require a prefix
        #/<slide_number><label>

        Parameters
        ----------
        name : str
            the label being referenced
        resources : dict
            if it holds a ``refmap`` containing ``name``, the mapped value
            is used as the link text

        Returns
        -------
        str
            an html anchor pointing at the label

        """
        if 'refmap' in resources and name in resources['refmap']:
            return r'<a href="{{id_home_prefix}}{0}">{1}</a>'.format(
                name, resources['refmap'][name])

        if self.labelbycolon and ':' in name:
            ref_name = name.split(':')[0]
        else:
            ref_name = 'ref'

        # labels are numbered per reference name, by first appearance
        numbers = self.refs.setdefault(ref_name, {})
        if name not in numbers:
            numbers[name] = len(numbers) + 1

        return r'<a href="{{id_home_prefix}}{0}">{1}. {2}</a>'.format(
            name, ref_name, numbers[name])

    @classmethod
    def _tag_argument(cls, tag):
        """Return the text inside the braces of a latex tag.

        Works for any command name and for tags with an optional argument,
        e.g. both ``\\\\cite{key}`` and ``\\\\citep[p. 3]{key}`` give ``key``.
        """
        match = cls._TAG_ARGUMENT_RE.match(tag)
        if match:
            return match.group(1)
        # a customised ``regex`` trait may yield other shapes: fall back to
        # everything between the first '{' and the final character
        return tag[:-1].partition('{')[2]

    @classmethod
    def _tag_names(cls, tag):
        """Return the comma separated names of a tag argument, stripped."""
        return [name.strip() for name in cls._tag_argument(tag).split(',')]

    @staticmethod
    def _join_links(links):
        """Join html snippets as 'a, b and c'."""
        if len(links) < 2:
            return ''.join(links)
        # joined explicitly (not by replacing the last comma of the joined
        # text), so commas inside a link text are never touched
        return ', '.join(links[:-1]) + ' and ' + links[-1]

    def convert(self, source, resources):
        """ convert a a string with tags in

        Every tag matched by the ``regex`` trait is handled by the first
        rule that applies, based on how the tag starts:

        - ``\\\\label``: replaced by an html anchor; inside an equation
          environment the anchor is instead appended to the end of the
          output when the environment closes
        - ``\\\\ref`` / ``\\\\cref``: replaced by links, see
          ``replace_reflabel``
        - ``\\\\cite``: replaced by bibliography entries, or by an
          'Unresolved citation' note
        - begin/end of equation, align, multline, gather (and starred
          variants) and of split: left in place
        - any other tag: left in place inside an equation environment,
          removed elsewhere

        Example
        -------

        >>> source = r'''
        ... References to \\cref{fig:example}, \\cref{tbl:example}, \\cref{eqn:example_sympy} and \\cref{code:example_mpl}.
        ...
        ... Referencing multiple items: \\cref{fig:example,fig:example_h,fig:example_v}.
        ...
        ... An unknown latex tag.\\unknown{zelenyak_molecular_2016}
        ... '''

        >>> processor = LatexTagsToHTML()
        >>> print(processor.convert(source,{}))
        <BLANKLINE>
        References to <a href="{id_home_prefix}fig:example">fig. 1</a>, <a href="{id_home_prefix}tbl:example">tbl. 1</a>, <a href="{id_home_prefix}eqn:example_sympy">eqn. 1</a> and <a href="{id_home_prefix}code:example_mpl">code. 1</a>.
        <BLANKLINE>
        Referencing multiple items: <a href="{id_home_prefix}fig:example">fig. 1</a>, <a href="{id_home_prefix}fig:example_h">fig. 2</a> and <a href="{id_home_prefix}fig:example_v">fig. 3</a>.
        <BLANKLINE>
        An unknown latex tag.
        <BLANKLINE>

        """
        begin_equation = tuple(
            '\\begin{{{0}}}'.format(env) for env in self._EQUATION_ENVS)
        end_equation = tuple(
            '\\end{{{0}}}'.format(env) for env in self._EQUATION_ENVS)
        passthrough = tuple(
            template.format(env)
            for env in self._PASSTHROUGH_ENVS
            for template in ('\\begin{{{0}}}', '\\end{{{0}}}'))

        new = source
        in_equation = False
        labels = []

        for tag in re.findall(self.regex, source):

            if tag.startswith('\\label'):
                link = r'<a id="{label}" class="anchor-link" name="#{label}">&#182;</a>'.format(
                    label=self._tag_argument(tag))
                if in_equation:
                    # html inside the math would break it: hold the anchor
                    # back until the environment is closed
                    labels.append(link)
                    new = new.replace(tag, '')
                else:
                    new = new.replace(tag, link)

            elif tag.startswith(('\\ref', '\\cref')):
                links = [self.replace_reflabel(name, resources)
                         for name in self._tag_names(tag)]
                new = new.replace(tag, self._join_links(links))

            elif tag.startswith('\\cite'):
                html = []
                for name in self._tag_names(tag):
                    if name in self.bibdatabase:
                        html.append(self.process_bib_entry(
                            self.bibdatabase[name]))
                    else:
                        html.append('Unresolved citation: {}.'.format(name))
                new = new.replace(tag, '[' + ', '.join(html) + ']')

            elif tag.startswith(begin_equation):
                in_equation = True

            elif tag.startswith(end_equation):
                new += ' '.join(labels)
                labels = []
                in_equation = False

            elif tag.startswith(passthrough):
                pass

            elif in_equation:
                # commands such as \frac{a} belong to the math itself
                pass

            else:
                new = new.replace(tag, '')

        return new

    def preprocess(self, nb, resources):
        """Convert the latex tags of a whole notebook.

        The bibliography is (re)read from ``resources['bibliopath']`` when
        that key is present. ``convert`` is then applied to the ``caption``
        of every dict found in a cell's ``ipub`` metadata, and to the
        source of every markdown cell.

        Returns
        -------
        tuple
            ``(nb, resources)``, both modified in place

        """
        logging.info('converting latex tags to html')

        if 'bibliopath' in resources:
            self.bibdatabase = self.read_bibliography(resources['bibliopath'])
        else:
            self.bibdatabase = {}

        for cell in nb.cells:

            if "ipub" in cell['metadata']:
                ipub = cell['metadata']["ipub"]
                for key in ipub:
                    if not isinstance(ipub[key], dict):
                        continue
                    if "caption" in ipub[key]:
                        ipub[key]["caption"] = self.convert(
                            ipub[key]["caption"], resources)

            if cell['cell_type'] == "markdown":
                cell['source'] = self.convert(cell['source'], resources)

        # NOTE(review): the consumer of 'refslide' is not visible in this
        # module; it is always (re)set to an empty dict here
        resources['refslide'] = {}

        return nb, resources