# Source code for ipypublish.preprocessors.latextags_to_html

import logging
import re
import string
import io

import bibtexparser
import traitlets as traits
from nbconvert.preprocessors import Preprocessor

from six import string_types


class DefaultFormatter(string.Formatter):
    """A ``string.Formatter`` that tolerates missing named fields.

    Named fields that are absent from the keyword arguments are replaced by
    ``default.format(<field name>)`` instead of raising ``KeyError``.
    Positional fields are resolved by the standard ``string.Formatter``.

    Parameters
    ----------
    default : str
        format string used to build the replacement for a missing named
        field; it receives the field name as its only positional argument

    """

    def __init__(self, default=""):
        super(DefaultFormatter, self).__init__()
        self.default = default

    def get_value(self, key, args, kwds):
        """Return the value for the field *key*.

        Parameters
        ----------
        key : str or int
            field name (named field) or index (positional field)
        args : sequence
            positional arguments given to ``format``
        kwds : mapping
            keyword arguments given to ``format``

        """
        if isinstance(key, string_types):
            if key in kwds:
                return kwds[key]
            # only build the fallback when the field is really missing
            return self.default.format(key)
        # positional field: delegate to the base class, passing ``self``
        # explicitly and returning its result
        return string.Formatter.get_value(self, key, args, kwds)


def safe_str(obj):
    """Convert *obj* to a ``str`` without raising on encoding problems.

    Objects exposing ``decode`` (e.g. ``bytes``) are decoded as UTF-8 first;
    byte sequences that are not valid UTF-8 are decoded with the ``replace``
    error handler rather than aborting the conversion.

    Parameters
    ----------
    obj : object
        the value to convert

    Returns
    -------
    str
        the text representation of *obj*

    """
    if hasattr(obj, "decode"):
        try:
            obj = obj.decode("utf-8")
        except UnicodeEncodeError:
            # presumably python 2.7 text that cannot be round-tripped through
            # ascii; it is kept unchanged and handled by the fallback below
            pass
        except UnicodeDecodeError:
            # invalid UTF-8 bytes: keep what can be read instead of failing
            obj = obj.decode("utf-8", "replace")
    try:
        return str(obj)
    except UnicodeEncodeError:
        # python 2.7
        obj = re.sub(u"\u2013", "-", obj)  # en dash
        obj = re.sub(u"\u2014", "--", obj)  # em dash
        return obj.encode("ascii", "ignore").decode("ascii")


class LatexTagsToHTML(Preprocessor):
    r""" a preprocessor to find latex tags
    (like ``\cite{abc}`` or ``\todo[color]{stuff}``) and:

    1. attempt to process them into a html friendly format
    2. remove them entirely if this is not possible

    for ``\ref`` or ``\cref``,
    attempts to use resources.refmap to map labels to reference names
    for labels not found in resources.refmap
    the reference name is '<name>. <number>', where:

    - <name> is either 'ref' or, if labelbycolon is True and
      the label has a colon, all text before the colon
    - <number> iterate by order of first appearance of a particular label


    NB: should be applied after LatexDocHTML,
    if you want resources.refmap to be available

    Examples
    --------
    >>> from nbformat import NotebookNode
    >>> from jsonextended.utils import MockPath

    >>> processor = LatexTagsToHTML()

    >>> bibfile = MockPath(is_file=True,content='''
    ... @article{bibkey,
    ... title = {the title},
    ... doi = {10.1134/S0018143916050209},
    ... author = {Surname, A. Name},
    ... date = {2016-09-01},
    ... }
    ... ''')
    >>> resources = NotebookNode(
    ...    {'bibliopath':bibfile, 'refmap':{"label":"label_name"}})

    >>> cell = NotebookNode({
    ... "cell_type":"markdown",
    ... "metadata":{},
    ... "source":"test"
    ... })
    >>> nb = NotebookNode({"cells":[cell]})
    >>> nb, _ = processor.preprocess(nb,resources)
    >>> print(nb.cells[0].source)
    test

    >>> cell.source = "\\unknown{test}"
    >>> nb, _ = processor.preprocess(nb,resources)
    >>> print(nb.cells[0].source)
    <BLANKLINE>

    >>> cell.source = "\\ref{label}\\unknown{test}"
    >>> nb, _ = processor.preprocess(nb,resources)
    >>> print(nb.cells[0].source)
    <a href="{id_home_prefix}label">label_name</a>

    >>> cell.source = "\\label{test}"
    >>> nb, _ = processor.preprocess(nb,resources)
    >>> print(nb.cells[0].source)
    <a id="test" class="anchor-link" name="#test">&#182;</a>

    >>> cell.source = "\\cite{bibkey}"
    >>> nb, _ = processor.preprocess(nb,resources)
    >>> print(nb.cells[0].source)
    [<a href="https://doi.org/10.1134/S0018143916050209">Surname <em>et al</em>, 2016.</a>]

    >>> cell.source = "\\begin{equation}x=a+b\\end{equation}"
    >>> nb, _ = processor.preprocess(nb,resources)
    >>> print(nb.cells[0].source)
    \begin{equation}x=a+b\end{equation}

    """  # noqa: E501

    # pattern used by ``convert`` to locate latex tags: a backslash, a command
    # name, an optional ``[...]`` argument and a ``{...}`` argument
    regex = traits.Unicode(
        r"\\(?:[^a-zA-Z]|[a-zA-Z]+[*=']?)(?:\[.*?\])?{.*?}",
        help="the regex to identify latex tags",
    ).tag(config=True)
    # template filled with the fields of a bibliography entry
    bibformat = traits.Unicode(
        "{author}, {year}.",
        help="the format to output \\cite{} tags found in the bibliography",
    ).tag(config=True)
    # whether the text before a colon in a label names the reference type
    labelbycolon = traits.Bool(
        True,
        help=(
            "create reference label based on text before colon, "
            "e.g. \\ref{fig:example} -> fig 1"
        ),
    ).tag(config=True)

    def __init__(self, *args, **kwargs):
        """Set up empty lookup tables, then initialise the base preprocessor."""
        # bibliography entries, keyed by citation key
        self.bibdatabase = {}
        # reference numbers already handed out, grouped by reference name,
        # so that every label receives its own number
        self.refs = {}
        super(LatexTagsToHTML, self).__init__(*args, **kwargs)

[docs]    @staticmethod
    def read_bibliography(path):
        """ read a bibliography

        """
        logging.info("reading bibliopath: {}".format(path))
        bibdatabase = {}
        bibparser = bibtexparser.bparser.BibTexParser()
        try:
            if hasattr(path, "open"):
                with path.open(encoding="utf8") as bibtex_file:
                    bibtex_data = bibtex_file.read()
            else:
                with io.open(path, encoding="utf8") as bibtex_file:
                    bibtex_data = bibtex_file.read()
            bibtex_data = safe_str(bibtex_data)
            bibdatabase = bibparser.parse(bibtex_data).entries_dict
        except Exception as err:
            logging.error("could not read bibliopath {}: {}".format(path, err))

        return bibdatabase

[docs]    def rreplace(self, source, target, replacement, replacements=1):
        """replace in string, from right-to-left"""
        return replacement.join(source.rsplit(target, replacements))

[docs]    def process_bib_entry(self, entry):
        """work out the best way to represent the bib entry """

        # abbreviate a list of authors
        if "author" in entry:
            authors = re.split(", | and ", entry["author"])
            if len(authors) > 1:
                author = authors[0] + " <em>et al</em>"
            else:
                author = authors[0]
            entry["author"] = author

            # split up date into year, month, day
        if "date" in entry:
            date = entry["date"].split("-")
            if len(date) == 3:
                entry["year"] = date[0]
                entry["month"] = date[1]
                entry["day"] = date[2]
            else:
                entry["year"] = date[0]

        text = DefaultFormatter().format(self.bibformat, **entry)

        if "doi" in entry:
            return r'<a href="https://doi.org/{doi}">{text}</a>'.format(
                doi=entry["doi"], text=text
            )
        elif "url" in entry:
            return r'<a href="{url}">{text}</a>'.format(url=entry["url"], text=text)
        elif "link" in entry:
            return r'<a href="{url}">{text}</a>'.format(url=entry["link"], text=text)
        else:
            return text

[docs]    def replace_reflabel(self, name, resources):
        """ find a suitable html replacement for a reference label

        the links are left with a format hook in them: {id_home_prefix},
        so that an nbconvert filter can later replace it
        this is particularly useful for slides,
        which require a prefix #/<slide_number><label>
        """
        if "refmap" in resources:
            if name in resources["refmap"]:
                return r'<a href="{{id_home_prefix}}{0}">{1}</a>'.format(
                    name, resources["refmap"][name]
                )

        if self.labelbycolon:
            ref_name = name.split(":")[0] if ":" in name else "ref"
        else:
            ref_name = "ref"
        if ref_name not in self.refs:
            self.refs[ref_name] = {}
        refs = self.refs[ref_name]
        if name in refs:
            id = refs[name]
        else:
            id = len(refs) + 1
            refs[name] = id
        return r'<a href="{{id_home_prefix}}{0}">{1}. {2}</a>'.format(
            name, ref_name, id
        )

[docs]    def convert(self, source, resources):
        """ convert a a string with tags in

        Example
        -------

        >>> source = r'''
        ... References to \\cref{fig:example}, \\cref{tbl:example}, \\cref{eqn:example_sympy} and \\cref{code:example_mpl}.
        ...
        ... Referencing multiple items: \\cref{fig:example,fig:example_h,fig:example_v}.
        ...
        ... An unknown latex tag.\\unknown{zelenyak_molecular_2016}
        ... '''
        >>> processor = LatexTagsToHTML()
        >>> print(processor.convert(source,{}))
        <BLANKLINE>
        References to <a href="{id_home_prefix}fig:example">fig. 1</a>, <a href="{id_home_prefix}tbl:example">tbl. 1</a>, <a href="{id_home_prefix}eqn:example_sympy">eqn. 1</a> and <a href="{id_home_prefix}code:example_mpl">code. 1</a>.
        <BLANKLINE>
        Referencing multiple items: <a href="{id_home_prefix}fig:example">fig. 1</a>, <a href="{id_home_prefix}fig:example_h">fig. 2</a> and <a href="{id_home_prefix}fig:example_v">fig. 3</a>.
        <BLANKLINE>
        An unknown latex tag.
        <BLANKLINE>

        """  # noqa: E501
        new = source
        in_equation = False
        labels = []
        for tag in re.findall(self.regex, source):

            if tag.startswith("\\label"):
                link = r'<a id="{label}" class="anchor-link" name="#{label}">&#182;</a>'.format(
                    label=tag[7:-1]
                )  # noqa: E501
                if in_equation:
                    labels.append(link)
                    new = new.replace(tag, "")
                else:
                    new = new.replace(tag, link)

            elif tag.startswith("\\ref"):
                names = tag[5:-1].split(",")
                html = []
                for name in names:
                    html.append(self.replace_reflabel(name, resources))
                new = new.replace(tag, self.rreplace(", ".join(html), ",", " and"))

            elif tag.startswith("\\cref"):
                names = tag[6:-1].split(",")
                html = []
                for name in names:
                    html.append(self.replace_reflabel(name, resources))
                new = new.replace(tag, self.rreplace(", ".join(html), ",", " and"))

            elif tag.startswith("\\cite"):
                names = tag[6:-1].split(",")
                html = []
                for name in names:
                    if name in self.bibdatabase:
                        html.append(self.process_bib_entry(self.bibdatabase[name]))
                    else:
                        html.append("Unresolved citation: {}.".format(name))
                new = new.replace(tag, "[" + ", ".join(html) + "]")

            elif any(
                [
                    tag.startswith("\\begin{{{0}}}".format(env))
                    for env in [
                        "equation",
                        "equation*",
                        "align",
                        "align*",
                        "multline",
                        "multline*",
                        "gather",
                        "gather*",
                    ]
                ]
            ):
                in_equation = True
            elif any(
                [
                    tag.startswith("\\end{{{0}}}".format(env))
                    for env in [
                        "equation",
                        "equation*",
                        "align",
                        "align*",
                        "multline",
                        "multline*",
                        "gather",
                        "gather*",
                    ]
                ]
            ):
                new += " ".join(labels)
                labels = []
                in_equation = False
            elif any(
                [tag.startswith("\\begin{{{0}}}".format(env)) for env in ["split"]]
            ):
                pass
            elif any([tag.startswith("\\end{{{0}}}".format(env)) for env in ["split"]]):
                pass
            else:
                new = new.replace(tag, "")
        return new

[docs]    def preprocess(self, nb, resources):

        logging.info("converting latex tags to html")
        if "bibliopath" in resources:
            self.bibdatabase = self.read_bibliography(resources["bibliopath"])
        else:
            self.bibdatabase = {}

        for cell in nb.cells:

            if "ipub" in cell["metadata"]:
                for key in cell["metadata"]["ipub"]:
                    if not isinstance(cell["metadata"]["ipub"][key], dict):
                        continue
                    if "caption" in cell["metadata"]["ipub"][key]:
                        text = cell["metadata"]["ipub"][key]["caption"]
                        key_dict = cell["metadata"]["ipub"][key]
                        key_dict["caption"] = self.convert(text, resources)

            if not cell["cell_type"] == "markdown":
                continue
            cell["source"] = self.convert(cell["source"], resources)

        resources["refslide"] = {}
        return nb, resources