Source code for ipypublish.preprocessors.latextags_to_html

import logging
import re
import string
import io

import bibtexparser
import traitlets as traits
from nbconvert.preprocessors import Preprocessor

from six import string_types


class DefaultFormatter(string.Formatter):
    """A string formatter that tolerates missing named fields.

    Named fields that are absent from the keyword arguments are replaced by
    ``default.format(<field name>)`` instead of raising ``KeyError``.
    Positional fields (``{0}``, ``{}``) are resolved by the base class.

    Parameters
    ----------
    default : str
        template used for missing named fields; it is itself formatted with
        the field name as its only positional argument (so ``"<{}>"``
        renders a missing ``{abc}`` as ``<abc>``)

    """

    def __init__(self, default=""):
        self.default = default

    def get_value(self, key, args, kwds):
        """Return the value for a replacement field.

        Parameters
        ----------
        key : int or str
            positional index or field name, as parsed from the format string
        args : sequence
            positional arguments passed to ``format``
        kwds : dict
            keyword arguments passed to ``format``

        """
        if isinstance(key, int):
            # positional field: the base class must be given ``self`` and its
            # result must be returned (both were previously missing, which
            # made every positional field raise a TypeError)
            return string.Formatter.get_value(self, key, args, kwds)
        if key in kwds:
            return kwds[key]
        # only render the default when it is actually needed
        return self.default.format(key)
def safe_str(obj):
    """Convert an object to a native string without raising unicode errors.

    Objects with a ``decode`` method (e.g. bytes) are first decoded as UTF-8;
    byte sequences that are not valid UTF-8 are decoded with replacement
    characters rather than raising.
    If ``str(obj)`` then fails with a ``UnicodeEncodeError`` (python 2.7),
    en/em dashes are replaced by hyphens and any remaining non-ascii
    characters are dropped.

    Parameters
    ----------
    obj : object
        the object to convert

    Returns
    -------
    str

    """
    if hasattr(obj, "decode"):
        try:
            obj = obj.decode("utf-8")
        except UnicodeDecodeError:
            # invalid UTF-8 input: keep what can be read instead of crashing
            obj = obj.decode("utf-8", "replace")
        except UnicodeEncodeError:
            # python 2.7: ``unicode.decode`` implicitly encodes to ascii
            # first; the object is already text, so leave it untouched
            pass
    try:
        return str(obj)
    except UnicodeEncodeError:  # python 2.7
        obj = re.sub(u"\u2013", "-", obj)  # en dash
        obj = re.sub(u"\u2014", "--", obj)  # em dash
        return obj.encode("ascii", "ignore").decode("ascii")
class LatexTagsToHTML(Preprocessor):
    r"""a preprocessor to find latex tags
    (like ``\cite{abc}`` or ``\todo[color]{stuff}``) and:

    1. attempt to process them into a html friendly format
    2. remove them entirely if this is not possible

    for ``\ref`` or ``\cref``, attempts to use resources.refmap
    to map labels to reference names

    for labels not found in resources.refmap
    the reference name is '<name>. <number>', where:

    - <name> is either ref or, if labelbycolon is True
      and the label has a colon, all text before the colon
    - <number> iterate by order of first appearance of a particular label

    NB: should be applied after LatexDocHTML,
    if you want resources.refmap to be available

    Examples
    --------

    >>> from nbformat import NotebookNode
    >>> from jsonextended.utils import MockPath
    >>> processor = LatexTagsToHTML()
    >>> bibfile = MockPath(is_file=True,content='''
    ... @article{bibkey,
    ...     title = {the title},
    ...     doi = {10.1134/S0018143916050209},
    ...     author = {Surname, A. Name},
    ...     date = {2016-09-01},
    ... }
    ... ''')
    >>> resources = NotebookNode(
    ...  {'bibliopath':bibfile, 'refmap':{"label":"label_name"}})

    >>> cell = NotebookNode({
    ...   "cell_type":"markdown",
    ...   "metadata":{},
    ...   "source":"test"
    ... })
    >>> nb = NotebookNode({"cells":[cell]})
    >>> nb, _ = processor.preprocess(nb,resources)
    >>> print(nb.cells[0].source)
    test

    >>> cell.source = "\\unknown{test}"
    >>> nb, _ = processor.preprocess(nb,resources)
    >>> print(nb.cells[0].source)
    <BLANKLINE>

    >>> cell.source = "\\ref{label}\\unknown{test}"
    >>> nb, _ = processor.preprocess(nb,resources)
    >>> print(nb.cells[0].source)
    <a href="{id_home_prefix}label">label_name</a>

    >>> cell.source = "\\label{test}"
    >>> nb, _ = processor.preprocess(nb,resources)
    >>> print(nb.cells[0].source)
    <a id="test" class="anchor-link" name="#test">&#182;</a>

    >>> cell.source = "\\cite{bibkey}"
    >>> nb, _ = processor.preprocess(nb,resources)
    >>> print(nb.cells[0].source)
    [<a href="https://doi.org/10.1134/S0018143916050209">Surname <em>et al</em>, 2016.</a>]

    >>> cell.source = "\\begin{equation}x=a+b\\end{equation}"
    >>> nb, _ = processor.preprocess(nb,resources)
    >>> print(nb.cells[0].source)
    \begin{equation}x=a+b\end{equation}

    """  # noqa: E501

    regex = traits.Unicode(
        r"\\(?:[^a-zA-Z]|[a-zA-Z]+[*=']?)(?:\[.*?\])?{.*?}",
        help="the regex to identify latex tags",
    ).tag(config=True)
    bibformat = traits.Unicode(
        "{author}, {year}.",
        help="the format to output \\cite{} tags found in the bibliography",
    ).tag(config=True)
    labelbycolon = traits.Bool(
        True,
        help=(
            "create reference label based on text before colon, "
            "e.g. \\ref{fig:example} -> fig 1"
        ),
    ).tag(config=True)

    # splits a tag matched by the default ``regex`` into its command,
    # optional ``[...]`` argument(s) and the text of the ``{...}`` argument
    _argument_regex = r"\\(?:[^a-zA-Z]|[a-zA-Z]+[*=']?)(?:\[.*?\])?\{(.*)\}\Z"

    # environments whose \label tags are collected rather than replaced inline
    _equation_envs = (
        "equation",
        "equation*",
        "align",
        "align*",
        "multline",
        "multline*",
        "gather",
        "gather*",
    )

    def __init__(self, *args, **kwargs):
        # a dictionary to keep track of references,
        # so they each get a different number
        self.refs = {}
        # bibliography references
        self.bibdatabase = {}
        super(LatexTagsToHTML, self).__init__(*args, **kwargs)

    @staticmethod
    def read_bibliography(path):
        """read a bibliography

        Parameters
        ----------
        path : str or path-like
            location of the bibtex file; objects with an ``open`` method
            are opened through it, anything else through ``io.open``

        Returns
        -------
        dict
            the parsed entries keyed by citation key,
            or an empty dict if the file could not be read or parsed

        """
        logging.info("reading bibliopath: {}".format(path))
        bibdatabase = {}
        bibparser = bibtexparser.bparser.BibTexParser()
        try:
            if hasattr(path, "open"):
                with path.open(encoding="utf8") as bibtex_file:
                    bibtex_data = bibtex_file.read()
            else:
                with io.open(path, encoding="utf8") as bibtex_file:
                    bibtex_data = bibtex_file.read()
            bibtex_data = safe_str(bibtex_data)
            bibdatabase = bibparser.parse(bibtex_data).entries_dict
        except Exception as err:
            # deliberately best-effort: a broken bibliography is logged and
            # citations are then reported as unresolved by ``convert``
            logging.error("could not read bibliopath {}: {}".format(path, err))

        return bibdatabase

    def rreplace(self, source, target, replacement, replacements=1):
        """replace in string, from right-to-left

        Parameters
        ----------
        source : str
            the string to search
        target : str
            the substring to replace
        replacement : str
            the text to insert in place of ``target``
        replacements : int
            maximum number of (right-most) occurrences to replace

        """
        return replacement.join(source.rsplit(target, replacements))

    def process_bib_entry(self, entry):
        """work out the best way to represent the bib entry

        Parameters
        ----------
        entry : dict
            a bibliography entry (field name -> value); it is not modified

        Returns
        -------
        str
            the entry formatted with ``bibformat`` and, if the entry has a
            ``doi``, ``url`` or ``link`` field, wrapped in a html link

        """
        # work on a copy, so that the shared bibliography database
        # is not altered by formatting one of its entries
        entry = dict(entry)

        # abbreviate a list of authors
        # NOTE: a single "Surname, Forename" author also splits into two
        # parts here and so is abbreviated too (the class doctest shows this)
        if "author" in entry:
            authors = re.split(", | and ", entry["author"])
            if len(authors) > 1:
                entry["author"] = authors[0] + " <em>et al</em>"
            else:
                entry["author"] = authors[0]

        # split up date into year, month, day
        if "date" in entry:
            date = entry["date"].split("-")
            entry["year"] = date[0]
            if len(date) == 3:
                entry["month"] = date[1]
                entry["day"] = date[2]

        # fields missing from the entry are rendered as empty strings
        text = DefaultFormatter().format(self.bibformat, **entry)

        if "doi" in entry:
            return r'<a href="https://doi.org/{doi}">{text}</a>'.format(
                doi=entry["doi"], text=text
            )
        if "url" in entry:
            return r'<a href="{url}">{text}</a>'.format(url=entry["url"], text=text)
        if "link" in entry:
            return r'<a href="{url}">{text}</a>'.format(url=entry["link"], text=text)
        return text

    def replace_reflabel(self, name, resources):
        """find a suitable html replacement for a reference label

        the links are left with a format hook in them: {id_home_prefix},
        so that an nbconvert filter can later replace it

        this is particularly useful for slides,
        which require a prefix #/<slide_number><label>

        Parameters
        ----------
        name : str
            the label being referenced
        resources : dict
            may contain a ``refmap`` dict, mapping labels to reference names

        Returns
        -------
        str
            a html link to the label

        """
        if "refmap" in resources and name in resources["refmap"]:
            return r'<a href="{{id_home_prefix}}{0}">{1}</a>'.format(
                name, resources["refmap"][name]
            )

        if self.labelbycolon and ":" in name:
            ref_name = name.split(":")[0]
        else:
            ref_name = "ref"

        # labels are numbered, per reference name, by order of first appearance
        refs = self.refs.setdefault(ref_name, {})
        number = refs.setdefault(name, len(refs) + 1)

        return r'<a href="{{id_home_prefix}}{0}">{1}. {2}</a>'.format(
            name, ref_name, number
        )

    def _tag_argument(self, tag, command):
        """return the text inside the ``{...}`` argument of a latex tag

        optional ``[...]`` arguments and command suffixes (``\\ref*{a}``,
        ``\\citep{a}``) are skipped; if the tag does not have the expected
        shape (e.g. it came from a customised ``regex``),
        fall back to slicing off ``command``, one brace character either side
        """
        match = re.match(self._argument_regex, tag, re.DOTALL)
        if match:
            return match.group(1)
        return tag[len(command) + 1 : -1]

    @staticmethod
    def _join_links(links):
        """join html snippets as 'a, b and c'"""
        if len(links) < 2:
            return "".join(links)
        # joined explicitly, rather than by replacing the last comma of the
        # joined text, so that a comma inside the final link is left alone
        return ", ".join(links[:-1]) + " and " + links[-1]

    def convert(self, source, resources):
        """convert a string with latex tags in it

        - ``\\label`` tags become html anchors (anchors for labels found
          inside an equation environment are appended to the end of the text,
          when the end of the environment is reached)
        - ``\\ref`` and ``\\cref`` tags become links to the label(s)
        - ``\\cite`` tags become the formatted bibliography entries
        - equation and split environment tags are left in place
        - any other tag is removed

        Parameters
        ----------
        source : str
            the text to convert
        resources : dict
            passed to ``replace_reflabel``

        Returns
        -------
        str
            the converted text

        Example
        -------

        >>> source = r'''
        ... References to \\cref{fig:example}, \\cref{tbl:example}, \\cref{eqn:example_sympy} and \\cref{code:example_mpl}.
        ...
        ... Referencing multiple items: \\cref{fig:example,fig:example_h,fig:example_v}.
        ...
        ... An unknown latex tag.\\unknown{zelenyak_molecular_2016}
        ... '''

        >>> processor = LatexTagsToHTML()
        >>> print(processor.convert(source,{}))
        <BLANKLINE>
        References to <a href="{id_home_prefix}fig:example">fig. 1</a>, <a href="{id_home_prefix}tbl:example">tbl. 1</a>, <a href="{id_home_prefix}eqn:example_sympy">eqn. 1</a> and <a href="{id_home_prefix}code:example_mpl">code. 1</a>.
        <BLANKLINE>
        Referencing multiple items: <a href="{id_home_prefix}fig:example">fig. 1</a>, <a href="{id_home_prefix}fig:example_h">fig. 2</a> and <a href="{id_home_prefix}fig:example_v">fig. 3</a>.
        <BLANKLINE>
        An unknown latex tag.
        <BLANKLINE>

        """  # noqa: E501
        equation_begin = tuple(
            "\\begin{{{0}}}".format(env) for env in self._equation_envs
        )
        equation_end = tuple("\\end{{{0}}}".format(env) for env in self._equation_envs)

        new = source
        in_equation = False
        labels = []

        for tag in re.findall(self.regex, source):

            if tag.startswith("\\label"):
                link = r'<a id="{label}" class="anchor-link" name="#{label}">&#182;</a>'.format(
                    label=self._tag_argument(tag, "\\label")
                )  # noqa: E501
                if in_equation:
                    # an anchor cannot sit inside the equation,
                    # so keep it until the environment is closed
                    labels.append(link)
                    new = new.replace(tag, "")
                else:
                    new = new.replace(tag, link)

            elif tag.startswith(("\\ref", "\\cref")):
                command = "\\ref" if tag.startswith("\\ref") else "\\cref"
                names = self._tag_argument(tag, command).split(",")
                html = [self.replace_reflabel(name, resources) for name in names]
                new = new.replace(tag, self._join_links(html))

            elif tag.startswith("\\cite"):
                html = []
                for name in self._tag_argument(tag, "\\cite").split(","):
                    # allow for white-space around the keys: \cite{a, b}
                    name = name.strip()
                    if name in self.bibdatabase:
                        html.append(self.process_bib_entry(self.bibdatabase[name]))
                    else:
                        html.append("Unresolved citation: {}.".format(name))
                new = new.replace(tag, "[" + ", ".join(html) + "]")

            elif tag.startswith(equation_begin):
                in_equation = True

            elif tag.startswith(equation_end):
                new += " ".join(labels)
                labels = []
                in_equation = False

            elif tag.startswith(("\\begin{split}", "\\end{split}")):
                # left in place, as part of an equation
                pass

            else:
                new = new.replace(tag, "")

        return new

    def preprocess(self, nb, resources):
        """convert the latex tags in a notebook

        the ``caption`` of every dict found in the ``ipub`` metadata
        of a cell, and the source of every markdown cell, are converted

        Parameters
        ----------
        nb : NotebookNode
            the notebook to convert (modified in place)
        resources : dict
            if it contains ``bibliopath``, that bibliography is read
            and used to resolve citations

        Returns
        -------
        nb : NotebookNode
        resources : dict
            with ``refslide`` set to an empty dict

        """
        logging.info("converting latex tags to html")

        if "bibliopath" in resources:
            self.bibdatabase = self.read_bibliography(resources["bibliopath"])
        else:
            self.bibdatabase = {}

        for cell in nb.cells:

            if "ipub" in cell["metadata"]:
                ipub = cell["metadata"]["ipub"]
                for key in ipub:
                    item = ipub[key]
                    if isinstance(item, dict) and "caption" in item:
                        item["caption"] = self.convert(item["caption"], resources)

            if cell["cell_type"] != "markdown":
                continue

            cell["source"] = self.convert(cell["source"], resources)

        resources["refslide"] = {}

        return nb, resources