import logging
import re
import string
import io
import bibtexparser
import traitlets as traits
from nbconvert.preprocessors import Preprocessor
from six import string_types
def safe_str(obj):
    """Convert ``obj`` to a native string without raising on encoding errors.

    Objects exposing ``decode`` (e.g. bytes) are first decoded as UTF-8;
    if that fails the object is left as it is and converted with ``str``.

    Parameters
    ----------
    obj : object
        the value to convert

    Returns
    -------
    str
        ``str(obj)``, or (when ``str`` raises ``UnicodeEncodeError``)
        an ASCII-only version of the text with en/em dashes replaced by
        hyphens and all other non-ASCII characters dropped

    """
    if hasattr(obj, "decode"):
        try:
            obj = obj.decode("utf-8")
        except UnicodeError:
            # UnicodeError covers both UnicodeDecodeError (bytes that are not
            # valid UTF-8) and UnicodeEncodeError (the case originally
            # handled here); either way fall through with obj unchanged
            pass
    try:
        return str(obj)
    except UnicodeEncodeError:
        # python 2.7
        obj = re.sub(u"\u2013", "-", obj)  # en dash
        obj = re.sub(u"\u2014", "--", obj)  # em dash
        return obj.encode("ascii", "ignore").decode("ascii")
class LatexTagsToHTML(Preprocessor):
    r"""A preprocessor to find latex tags
    (like ``\cite{abc}`` or ``\todo[color]{stuff}``) and:

    1. attempt to process them into a html friendly format
    2. remove them entirely if this is not possible

    for ``\ref`` or ``\cref``,
    attempts to use resources.refmap to map labels to reference names

    for labels not found in resources.refmap
    the reference name is '<name>. <number>', where:

    - <name> is either ref or, if labelbycolon is True and
      the label has a colon, all text before the colon
    - <number> iterate by order of first appearance of a particular label

    NB: should be applied after LatexDocHTML,
    if you want resources.refmap to be available

    Examples
    --------
    >>> from nbformat import NotebookNode
    >>> from jsonextended.utils import MockPath
    >>> processor = LatexTagsToHTML()
    >>> bibfile = MockPath(is_file=True,content='''
    ... @article{bibkey,
    ... title = {the title},
    ... doi = {10.1134/S0018143916050209},
    ... author = {Surname, A. Name},
    ... date = {2016-09-01},
    ... }
    ... ''')
    >>> resources = NotebookNode(
    ... {'bibliopath':bibfile, 'refmap':{"label":"label_name"}})
    >>> cell = NotebookNode({
    ... "cell_type":"markdown",
    ... "metadata":{},
    ... "source":"test"
    ... })
    >>> nb = NotebookNode({"cells":[cell]})
    >>> nb, _ = processor.preprocess(nb,resources)
    >>> print(nb.cells[0].source)
    test
    >>> cell.source = "\\unknown{test}"
    >>> nb, _ = processor.preprocess(nb,resources)
    >>> print(nb.cells[0].source)
    <BLANKLINE>
    >>> cell.source = "\\ref{label}\\unknown{test}"
    >>> nb, _ = processor.preprocess(nb,resources)
    >>> print(nb.cells[0].source)
    <a href="{id_home_prefix}label">label_name</a>
    >>> cell.source = "\\label{test}"
    >>> nb, _ = processor.preprocess(nb,resources)
    >>> print(nb.cells[0].source)
    <a id="test" class="anchor-link" name="#test">¶</a>
    >>> cell.source = "\\cite{bibkey}"
    >>> nb, _ = processor.preprocess(nb,resources)
    >>> print(nb.cells[0].source)
    [<a href="https://doi.org/10.1134/S0018143916050209">Surname <em>et al</em>, 2016.</a>]
    >>> cell.source = "\\begin{equation}x=a+b\\end{equation}"
    >>> nb, _ = processor.preprocess(nb,resources)
    >>> print(nb.cells[0].source)
    \begin{equation}x=a+b\end{equation}

    """  # noqa: E501

    regex = traits.Unicode(
        r"\\(?:[^a-zA-Z]|[a-zA-Z]+[*=']?)(?:\[.*?\])?{.*?}",
        help="the regex to identify latex tags",
    ).tag(config=True)
    bibformat = traits.Unicode(
        "{author}, {year}.",
        help="the format to output \\cite{} tags found in the bibliography",
    ).tag(config=True)
    labelbycolon = traits.Bool(
        True,
        help=(
            "create reference label based on text before colon, "
            "e.g. \\ref{fig:example} -> fig 1"
        ),
    ).tag(config=True)

    # splits a tag of the default ``regex`` shape into its pieces so that the
    # text between the braces can be captured, whatever the length of the
    # command name and whether or not an optional [..] argument is present
    _TAG_ARGUMENT = re.compile(
        r"\\(?:[^a-zA-Z]|[a-zA-Z]+[*=']?)(?:\[.*?\])?\{(.*)\}$"
    )

    # math environments inside which labels are collected rather than
    # replaced in place
    _EQUATION_ENVS = (
        "equation",
        "equation*",
        "align",
        "align*",
        "multline",
        "multline*",
        "gather",
        "gather*",
    )
    _EQUATION_BEGIN = tuple("\\begin{{{0}}}".format(env) for env in _EQUATION_ENVS)
    _EQUATION_END = tuple("\\end{{{0}}}".format(env) for env in _EQUATION_ENVS)
    # tags which are left in the text untouched
    _PASSTHROUGH = ("\\begin{split}", "\\end{split}")

    def __init__(self, *args, **kwargs):
        # a dictionary to keep track of references,
        # so they each get a different number
        self.refs = {}
        # bibliography references
        self.bibdatabase = {}
        super(LatexTagsToHTML, self).__init__(*args, **kwargs)

    @staticmethod
    def read_bibliography(path):
        """Read a bibliography.

        Parameters
        ----------
        path : str or object
            a file path, or any object with an ``open(encoding=...)`` method

        Returns
        -------
        dict
            the ``entries_dict`` of the parsed bibtex database,
            or an empty dict if reading or parsing failed
            (the failure is logged, not raised)

        """
        logging.info("reading bibliopath: {}".format(path))
        bibdatabase = {}
        bibparser = bibtexparser.bparser.BibTexParser()
        try:
            if hasattr(path, "open"):
                handle = path.open(encoding="utf8")
            else:
                handle = io.open(path, encoding="utf8")
            with handle as bibtex_file:
                bibtex_data = bibtex_file.read()
            bibtex_data = safe_str(bibtex_data)
            bibdatabase = bibparser.parse(bibtex_data).entries_dict
        except Exception as err:
            # deliberately best-effort: a missing or malformed bibliography
            # must not abort the conversion
            logging.error("could not read bibliopath {}: {}".format(path, err))
        return bibdatabase

    def rreplace(self, source, target, replacement, replacements=1):
        """Replace ``target`` in ``source``, from right-to-left.

        At most ``replacements`` occurrences are replaced.
        """
        return replacement.join(source.rsplit(target, replacements))

    def process_bib_entry(self, entry):
        """Work out the best way to represent the bib entry.

        Parameters
        ----------
        entry : dict
            a bibliography entry; it is not modified

        Returns
        -------
        str
            ``bibformat`` filled with the entry fields, wrapped in a link
            if the entry has a ``doi``, ``url`` or ``link`` field
            (checked in that order)

        """
        # work on a copy, so the shared bibliography database is not altered
        entry = dict(entry)

        # abbreviate a list of authors
        if "author" in entry:
            authors = re.split(", | and ", entry["author"])
            if len(authors) > 1:
                entry["author"] = authors[0] + " <em>et al</em>"
            else:
                entry["author"] = authors[0]

        # split up date into year, month, day
        if "date" in entry:
            date = entry["date"].split("-")
            entry["year"] = date[0]
            if len(date) == 3:
                entry["month"] = date[1]
                entry["day"] = date[2]

        # NOTE(review): DefaultFormatter is not defined in the visible part
        # of this module; it is assumed to be available at module level
        text = DefaultFormatter().format(self.bibformat, **entry)

        if "doi" in entry:
            return r'<a href="https://doi.org/{doi}">{text}</a>'.format(
                doi=entry["doi"], text=text
            )
        elif "url" in entry:
            return r'<a href="{url}">{text}</a>'.format(url=entry["url"], text=text)
        elif "link" in entry:
            return r'<a href="{url}">{text}</a>'.format(url=entry["link"], text=text)
        else:
            return text

    def replace_reflabel(self, name, resources):
        """Find a suitable html replacement for a reference label.

        the links are left with a format hook in them: {id_home_prefix},
        so that an nbconvert filter can later replace it

        this is particularly useful for slides,
        which require a prefix #/<slide_number><label>

        Parameters
        ----------
        name : str
            the label being referenced
        resources : dict
            if it has a ``refmap`` containing ``name``,
            the mapped value is used as the link text

        Returns
        -------
        str
            the html link

        """
        if "refmap" in resources:
            if name in resources["refmap"]:
                return r'<a href="{{id_home_prefix}}{0}">{1}</a>'.format(
                    name, resources["refmap"][name]
                )

        if self.labelbycolon and ":" in name:
            ref_name = name.split(":")[0]
        else:
            ref_name = "ref"

        # each reference name has its own counter, and a label keeps the
        # number it was given on its first appearance
        refs = self.refs.setdefault(ref_name, {})
        if name not in refs:
            refs[name] = len(refs) + 1
        number = refs[name]

        return r'<a href="{{id_home_prefix}}{0}">{1}. {2}</a>'.format(
            name, ref_name, number
        )

    @classmethod
    def _tag_argument(cls, tag, fallback_start):
        """Return the text between the braces of a tag.

        ``fallback_start`` is the slice start used if the tag does not have
        the shape of the default ``regex`` (e.g. with a customised regex).
        """
        match = cls._TAG_ARGUMENT.match(tag)
        if match is None:
            return tag[fallback_start:-1]
        return match.group(1)

    @staticmethod
    def _split_names(argument):
        """Split a comma-delimited tag argument, ignoring padding spaces."""
        return [name.strip() for name in argument.split(",")]

    def _join_references(self, names, resources):
        """Return the links for the names, formatted as 'a, b and c'."""
        links = [self.replace_reflabel(name, resources) for name in names]
        if len(links) < 2:
            return ", ".join(links)
        # joined explicitly, so that a comma within the text of a link
        # is never mistaken for the list delimiter
        return ", ".join(links[:-1]) + " and " + links[-1]

    def _join_citations(self, names):
        """Return the bracketed, comma-delimited citations for the names."""
        html = []
        for name in names:
            if name in self.bibdatabase:
                html.append(self.process_bib_entry(self.bibdatabase[name]))
            else:
                html.append("Unresolved citation: {}.".format(name))
        return "[" + ", ".join(html) + "]"

    def convert(self, source, resources):
        """Convert a string with tags in.

        Tags are dispatched on how they start: label tags become anchors,
        ref/cref tags become links, cite tags become citations,
        begin/end tags of math environments (and of split) are kept,
        and any other tag is removed.

        Anchors for labels found between the begin and end tags of a math
        environment are removed from their position and appended to the end
        of the text when the end tag is reached.

        Parameters
        ----------
        source : str
            the text to convert
        resources : dict
            passed on to ``replace_reflabel``

        Returns
        -------
        str
            the converted text

        Example
        -------
        >>> source = r'''
        ... References to \\cref{fig:example}, \\cref{tbl:example}, \\cref{eqn:example_sympy} and \\cref{code:example_mpl}.
        ...
        ... Referencing multiple items: \\cref{fig:example,fig:example_h,fig:example_v}.
        ...
        ... An unknown latex tag.\\unknown{zelenyak_molecular_2016}
        ... '''
        >>> processor = LatexTagsToHTML()
        >>> print(processor.convert(source,{}))
        <BLANKLINE>
        References to <a href="{id_home_prefix}fig:example">fig. 1</a>, <a href="{id_home_prefix}tbl:example">tbl. 1</a>, <a href="{id_home_prefix}eqn:example_sympy">eqn. 1</a> and <a href="{id_home_prefix}code:example_mpl">code. 1</a>.
        <BLANKLINE>
        Referencing multiple items: <a href="{id_home_prefix}fig:example">fig. 1</a>, <a href="{id_home_prefix}fig:example_h">fig. 2</a> and <a href="{id_home_prefix}fig:example_v">fig. 3</a>.
        <BLANKLINE>
        An unknown latex tag.
        <BLANKLINE>

        """  # noqa: E501
        new = source
        in_equation = False
        # anchors of the labels found within the current math environment
        labels = []

        for tag in re.findall(self.regex, source):
            if tag.startswith("\\label"):
                link = r'<a id="{label}" class="anchor-link" name="#{label}">¶</a>'.format(
                    label=self._tag_argument(tag, 7)
                )  # noqa: E501
                if in_equation:
                    labels.append(link)
                    new = new.replace(tag, "")
                else:
                    new = new.replace(tag, link)
            elif tag.startswith(("\\ref", "\\cref")):
                fallback_start = 5 if tag.startswith("\\ref") else 6
                names = self._split_names(self._tag_argument(tag, fallback_start))
                new = new.replace(tag, self._join_references(names, resources))
            elif tag.startswith("\\cite"):
                names = self._split_names(self._tag_argument(tag, 6))
                new = new.replace(tag, self._join_citations(names))
            elif tag.startswith(self._EQUATION_BEGIN):
                in_equation = True
            elif tag.startswith(self._EQUATION_END):
                new += " ".join(labels)
                labels = []
                in_equation = False
            elif tag.startswith(self._PASSTHROUGH):
                pass
            else:
                new = new.replace(tag, "")

        return new

    def preprocess(self, nb, resources):
        """Convert the latex tags in the markdown cells and ipub captions.

        Parameters
        ----------
        nb : NotebookNode
            the notebook; its cells are modified in place
        resources : dict
            if it has a ``bibliopath``, the bibliography is read from it

        Returns
        -------
        tuple
            the notebook and the resources

        """
        logging.info("converting latex tags to html")

        if "bibliopath" in resources:
            self.bibdatabase = self.read_bibliography(resources["bibliopath"])
        else:
            self.bibdatabase = {}

        for cell in nb.cells:
            # captions are converted for every type of cell
            if "ipub" in cell["metadata"]:
                for key in cell["metadata"]["ipub"]:
                    key_dict = cell["metadata"]["ipub"][key]
                    if not isinstance(key_dict, dict):
                        continue
                    if "caption" in key_dict:
                        key_dict["caption"] = self.convert(
                            key_dict["caption"], resources
                        )

            if cell["cell_type"] != "markdown":
                continue
            cell["source"] = self.convert(cell["source"], resources)

        # NOTE(review): refslide is only reset here, never read in this
        # class; presumably it is filled in by a later step - confirm
        resources["refslide"] = {}

        return nb, resources