Skip to content

Module manubot.cite.citekey

Utilities for representing and processing citation keys.

View Source

Utilities for representing and processing citation keys.


import dataclasses

import functools

import logging

import re

import typing as tp

# functools.cached_property exists on Python 3.8+; older interpreters fall
# back to the backports package that provides the same decorator.
try:
    from functools import cached_property
except ImportError:
    from backports.cached_property import cached_property


@dataclasses.dataclass
class CiteKey:
    """
    A citation key and the identifiers derived from it.

    Derived values (prefix, accession and their standardized forms) are
    computed lazily on first access and stored on the instance.
    """

    input_id: str
    """Input identifier for the citekey"""
    aliases: dict = dataclasses.field(default_factory=dict)
    """Mapping from input identifier to aliases"""
    infer_prefix: bool = True
    """Whether to infer the citekey's prefix when a prefix is missing or unhandled"""

    def __post_init__(self):
        """Validate `input_id` immediately after dataclass construction."""
        self.check_input_id(self.input_id)

    @staticmethod
    def check_input_id(input_id) -> None:
        """
        Validate a candidate input identifier.

        Raises TypeError when `input_id` is not a str and ValueError when it
        starts with '@'.
        """
        if not isinstance(input_id, str):
            raise TypeError(
                "input_id should be type 'str' not "
                f"{type(input_id).__name__!r}: {input_id!r}"
            )
        if input_id.startswith("@"):
            raise ValueError(f"invalid citekey input_id: {input_id!r}\nstarts with '@'")

    @classmethod
    @functools.lru_cache(maxsize=None)
    def from_input_id(cls, *args, **kwargs) -> "CiteKey":
        """Cached constructor"""
        # The cache keys on the call arguments, so they must all be hashable.
        return cls(*args, **kwargs)

    @cached_property
    def dealiased_id(self) -> str:
        """
        If `self.input_id` is in `self.aliases`, the value specified by
        `self.aliases`. Otherwise, `self.input_id`.
        """
        return self.aliases.get(self.input_id, self.input_id)

    def _set_prefix_accession(self) -> None:
        """
        Set `self._prefix` and `self._accession` by splitting
        `self.dealiased_id` on its first colon, then optionally infer a
        prefix when the split one is missing or unknown.
        """
        self._prefix = None
        self._accession = None
        split_id = self.dealiased_id.split(":", 1)
        if len(split_id) == 2:
            self._prefix, self._accession = split_id
        if self.infer_prefix and not self.is_known_prefix:
            self._infer_prefix()

    def _infer_prefix(self) -> None:
        """
        Treat `self.dealiased_id` as missing a prefix.
        If the prefix can be inferred, set `self._prefix` and `self._accession`.
        Only call this function from _set_prefix_accession,
        since it is not safe after instance attributes or properties have been cached.
        """
        from .handlers import infer_prefix

        prefix = infer_prefix(self.dealiased_id)
        if not prefix:
            return
        self._prefix = prefix
        # With an inferred prefix, the whole dealiased id is the accession.
        self._accession = self.dealiased_id

    @property
    def prefix(self) -> tp.Optional[str]:
        """
        If `self.input_id` contains a colon, the substring up to the first colon.
        Otherwise, None.
        """
        if not hasattr(self, "_prefix"):
            self._set_prefix_accession()
        return self._prefix

    @property
    def prefix_lower(self) -> tp.Optional[str]:
        """
        A lowercase version of `self.prefix` or None.
        """
        if self.prefix is None:
            return None
        return self.prefix.lower()

    @property
    def accession(self) -> tp.Optional[str]:
        """
        If `self.prefix`, the remainder of `self.input_id` following the first colon.
        """
        if not hasattr(self, "_accession"):
            self._set_prefix_accession()
        return self._accession

    @property
    def standard_prefix(self) -> tp.Optional[str]:
        """
        If the citekey is handled, the standard prefix specified by the handler.
        Otherwise, None.
        """
        if not hasattr(self, "_standard_prefix"):
            self._standardize()
        return self._standard_prefix

    @property
    def standard_accession(self) -> tp.Optional[str]:
        """
        If the citekey is handled, the standard accession specified by the handler.
        Otherwise, None.
        """
        if not hasattr(self, "_standard_accession"):
            self._standardize()
        return self._standard_accession

    @cached_property
    def handler(self):
        """
        The handler registered for `self.prefix_lower`, or a generic
        `Handler` built from that prefix when none is registered.
        """
        from .handlers import Handler, get_handler

        if self.is_handled_prefix:
            return get_handler(self.prefix_lower)
        return Handler(self.prefix_lower)

    @property
    def is_handled_prefix(self) -> bool:
        """Whether `self.prefix_lower` is a key of `prefix_to_handler`."""
        from .handlers import prefix_to_handler

        return self.prefix_lower in prefix_to_handler

    @property
    def is_known_prefix(self) -> bool:
        """Whether the prefix is handled or is a pandoc-xnos prefix."""
        return self.is_handled_prefix or self.is_pandoc_xnos_prefix()

    def inspect(self) -> tp.Optional[str]:
        """
        Inspect citekey for potential problems.
        If no problems are found, return None.
        Otherwise, returns a string describing the problem.
        """
        return self.handler.inspect(self)

    def _standardize(self) -> None:
        """
        Set `self._standard_prefix`, `self._standard_accession`, and `self._standard_id`.
        For citekeys without a prefix or with an unhandled prefix, _standard_prefix
        and _standard_accession are set to None.
        """
        if not self.is_handled_prefix:
            self._standard_prefix = None
            self._standard_accession = None
            self._standard_id = self.dealiased_id
            return
        fxn = self.handler.standardize_prefix_accession
        self._standard_prefix, self._standard_accession = fxn(self.accession)
        self._standard_id = f"{self._standard_prefix}:{self._standard_accession}"

    @property
    def standard_id(self) -> str:
        """
        If the citekey is handled, the standard_id specified by the handler.
        Otherwise, `self.dealiased_id`.
        """
        if not hasattr(self, "_standard_id"):
            self._standardize()
        return self._standard_id

    @cached_property
    def short_id(self) -> str:
        """
        A hashed version of standard_id whose characters are
        within the ranges 0-9, a-z and A-Z.
        """
        return shorten_citekey(self.standard_id)

    @cached_property
    def all_ids(self) -> tp.List[str]:
        """The distinct, non-empty identifiers of this citekey, in derivation order."""
        ids = [self.input_id, self.dealiased_id, self.standard_id, self.short_id]
        ids = [x for x in ids if x]  # remove None
        ids = list(dict.fromkeys(ids))  # deduplicate
        return ids

    def __hash__(self):
        """Hash on the input and dealiased identifiers."""
        return hash((self.input_id, self.dealiased_id))

    def __repr__(self):
        """Join the main identifiers as 'value (name)' pairs separated by ' --> '."""
        # NOTE(review): the key tuple was lost from the rendered listing and is
        # restored from the upstream project -- confirm.
        return " --> ".join(
            f"{getattr(self, key)} ({key})"
            for key in (
                "input_id",
                "dealiased_id",
                "prefix_lower",
                "accession",
                "standard_id",
                "short_id",
            )
        )

    @cached_property
    def csl_item(self):
        """
        The CSL item returned by the handler, wrapped in `CSL_Item` when the
        handler returned some other type.
        """
        from .csl_item import CSL_Item

        csl_item = self.handler.get_csl_item(self)
        if not isinstance(csl_item, CSL_Item):
            csl_item = CSL_Item(csl_item)
        # NOTE(review): this call was lost from the rendered listing and is
        # restored from the upstream project -- confirm.
        csl_item.set_id(self.standard_id)
        return csl_item

    def is_pandoc_xnos_prefix(self, log_case_warning: bool = False) -> bool:
        """
        Return True when `self.prefix` is exactly one of the pandoc-xnos prefixes.

        When `log_case_warning` is set and the prefix only matches after
        lowercasing, log a warning and still return False.
        """
        from .handlers import _pandoc_xnos_prefixes

        if self.prefix in _pandoc_xnos_prefixes:
            return True
        if log_case_warning and self.prefix_lower in _pandoc_xnos_prefixes:
            logging.warning(
                "pandoc-xnos prefixes should be all lowercase.\n"
                f'Should {self.input_id!r} use {self.prefix_lower!r} rather than "{self.prefix!r}"?'
            )
        return False

def shorten_citekey(standard_citekey: str) -> str:
    """
    Return a shortened citekey derived from the input citekey.
    The input citekey should be standardized prior to this function,
    since differences in the input citekey will result in different shortened citekeys.
    Short citekeys are generated by converting the input citekey to a 6 byte hash,
    and then converting this digest to a base62 ASCII str. Shortened
    citekeys consist of characters in the following ranges: 0-9, a-z and A-Z.
    """
    # Check the input before the imports so that a bad citekey is reported
    # even when the third-party base62 package is not installed.
    assert not standard_citekey.startswith("@")

    import hashlib

    import base62

    digest = hashlib.blake2b(standard_citekey.encode(), digest_size=6).digest()
    return base62.encodebytes(digest)

def citekey_to_csl_item(
    citekey, prune=True, manual_refs=None, log_level: tp.Union[str, int] = "WARNING"
):
    """
    Generate a CSL_Item for the input citekey.

    `citekey` may be a `CiteKey` or a str, which is wrapped in a `CiteKey`.
    If `citekey.standard_id` is a key of `manual_refs`, that value is returned
    unchanged. If generating the item raises, the failure is logged at
    `log_level` with its traceback and None is returned.
    """
    from manubot import __version__ as manubot_version

    if manual_refs is None:
        manual_refs = {}
    # Accept either a level name or a numeric level.
    log_level = logging._checkLevel(log_level)
    if not isinstance(citekey, CiteKey):
        citekey = CiteKey(citekey)
    if citekey.standard_id in manual_refs:
        return manual_refs[citekey.standard_id]
    try:
        csl_item = citekey.csl_item
    except Exception as error:
        # Log the message as a plain string; the listing wrapped it in a
        # one-element tuple, which would be logged as its tuple repr.
        logging.log(
            log_level,
            f"Generating csl_item for {citekey.standard_id!r} failed "
            f"due to a {error.__class__.__name__}:\n{error}",
            exc_info=True,
        )
        return None
    # update csl_item with manubot generated metadata
    note_text = f"This CSL Item was generated by Manubot v{manubot_version} from its persistent identifier (standard_id)."
    note_dict = {"standard_id": citekey.standard_id}
    # NOTE(review): the four calls below were lost from the rendered listing
    # and are restored from the upstream project's CSL_Item API -- confirm.
    csl_item.note_append_text(note_text)
    csl_item.note_append_dict(note_dict)
    csl_item.set_id(citekey.short_id)
    csl_item.clean(prune=prune)
    return csl_item

def url_to_citekey(url: str) -> str:
    """
    Convert a HTTP(s) URL into a citekey.
    For supported sources, convert from url citekey to an alternative source like doi.
    If citekeys fail inspection, revert alternative sources to URLs.

    URLs with no recognizable hostname, or with a single-label hostname such
    as `localhost`, fall back to a `url:` citekey instead of raising.
    """
    from urllib.parse import unquote, urlparse

    citekey = None
    parsed_url = urlparse(url)
    # urlparse reports hostname=None when the string has no network location.
    hostname = parsed_url.hostname or ""
    domain_levels = hostname.split(".")
    if domain_levels[-2:] == ["doi", "org"]:
        # DOI URLs
        doi = unquote(parsed_url.path.lstrip("/"))
        citekey = f"doi:{doi}"
    # Slice rather than index so single-label hostnames do not raise IndexError.
    if domain_levels[-2:-1] == ["sci-hub"]:
        # Sci-Hub domains
        doi = parsed_url.path.lstrip("/")
        citekey = f"doi:{doi}"
    if domain_levels[-2:] == ["biorxiv", "org"]:
        # bioRxiv URL to DOI.
        # NOTE(review): the pattern was lost from the rendered listing and is
        # restored from the upstream project -- confirm.
        match = re.search(
            r"/(?P<biorxiv_id>([0-9]{4}\.[0-9]{2}\.[0-9]{2}\.)?[0-9]{6,})",
            parsed_url.path,
        )
        if match:
            citekey = f"doi:10.1101/{match.group('biorxiv_id')}"
    # NOTE(review): the hostname literal was emptied in the rendered listing,
    # which made this test always true; restored as the NCBI domain -- confirm.
    is_ncbi_url = hostname.endswith("ncbi.nlm.nih.gov")
    if is_ncbi_url and parsed_url.path.startswith("/pubmed/"):
        # PubMed URLs
        try:
            pmid = parsed_url.path.split("/")[2]
            citekey = f"pmid:{pmid}"
        except IndexError:
            pass
    if is_ncbi_url and parsed_url.path.startswith("/pmc/"):
        # PubMed Central URLs
        try:
            pmcid = parsed_url.path.split("/")[3]
            citekey = f"pmcid:{pmcid}"
        except IndexError:
            pass
    if domain_levels[-2:] == ["wikidata", "org"] and parsed_url.path.startswith(
        "/wiki/"
    ):
        # Wikidata URLs
        try:
            wikidata_id = parsed_url.path.split("/")[2]
            citekey = f"wikidata:{wikidata_id}"
        except IndexError:
            pass
    if domain_levels[-2:] == ["arxiv", "org"]:
        # arXiv identifiers
        try:
            arxiv_id = parsed_url.path.split("/", maxsplit=2)[2]
            if arxiv_id.endswith(".pdf"):
                arxiv_id = arxiv_id[:-4]
            citekey = f"arxiv:{arxiv_id}"
        except IndexError:
            pass
    # Keep the derived citekey only when it passes inspection.
    if citekey is None or CiteKey(citekey).inspect() is not None:
        citekey = f"url:{url}"
    return citekey



def citekey_to_csl_item(
    citekey,
    prune=True,
    manual_refs=None,
    log_level: Union[str, int] = 'WARNING'
)

Generate a CSL_Item for the input citekey.

View Source
def citekey_to_csl_item(
    citekey, prune=True, manual_refs=None, log_level: tp.Union[str, int] = "WARNING"
):
    """
    Generate a CSL_Item for the input citekey.

    `citekey` may be a `CiteKey` or a str, which is wrapped in a `CiteKey`.
    If `citekey.standard_id` is a key of `manual_refs`, that value is returned
    unchanged. If generating the item raises, the failure is logged at
    `log_level` with its traceback and None is returned.
    """
    from manubot import __version__ as manubot_version

    if manual_refs is None:
        manual_refs = {}
    # Accept either a level name or a numeric level.
    log_level = logging._checkLevel(log_level)
    if not isinstance(citekey, CiteKey):
        citekey = CiteKey(citekey)
    if citekey.standard_id in manual_refs:
        return manual_refs[citekey.standard_id]
    try:
        csl_item = citekey.csl_item
    except Exception as error:
        # Log the message as a plain string; the listing wrapped it in a
        # one-element tuple, which would be logged as its tuple repr.
        logging.log(
            log_level,
            f"Generating csl_item for {citekey.standard_id!r} failed "
            f"due to a {error.__class__.__name__}:\n{error}",
            exc_info=True,
        )
        return None
    # update csl_item with manubot generated metadata
    note_text = f"This CSL Item was generated by Manubot v{manubot_version} from its persistent identifier (standard_id)."
    note_dict = {"standard_id": citekey.standard_id}
    # NOTE(review): the four calls below were lost from the rendered listing
    # and are restored from the upstream project's CSL_Item API -- confirm.
    csl_item.note_append_text(note_text)
    csl_item.note_append_dict(note_dict)
    csl_item.set_id(citekey.short_id)
    csl_item.clean(prune=prune)
    return csl_item


def shorten_citekey(
    standard_citekey: str
) -> str

Return a shortened citekey derived from the input citekey.

The input citekey should be standardized prior to this function, since differences in the input citekey will result in different shortened citekeys. Short citekeys are generated by converting the input citekey to a 6 byte hash, and then converting this digest to a base62 ASCII str. Shortened citekeys consist of characters in the following ranges: 0-9, a-z and A-Z.

View Source
def shorten_citekey(standard_citekey: str) -> str:
    """
    Return a shortened citekey derived from the input citekey.
    The input citekey should be standardized prior to this function,
    since differences in the input citekey will result in different shortened citekeys.
    Short citekeys are generated by converting the input citekey to a 6 byte hash,
    and then converting this digest to a base62 ASCII str. Shortened
    citekeys consist of characters in the following ranges: 0-9, a-z and A-Z.
    """
    # Check the input before the imports so that a bad citekey is reported
    # even when the third-party base62 package is not installed.
    assert not standard_citekey.startswith("@")

    import hashlib

    import base62

    digest = hashlib.blake2b(standard_citekey.encode(), digest_size=6).digest()
    return base62.encodebytes(digest)


def url_to_citekey(
    url: str
) -> str

Convert a HTTP(s) URL into a citekey.

For supported sources, convert from url citekey to an alternative source like doi. If citekeys fail inspection, revert alternative sources to URLs.

View Source
def url_to_citekey(url: str) -> str:
    """
    Convert a HTTP(s) URL into a citekey.
    For supported sources, convert from url citekey to an alternative source like doi.
    If citekeys fail inspection, revert alternative sources to URLs.

    URLs with no recognizable hostname, or with a single-label hostname such
    as `localhost`, fall back to a `url:` citekey instead of raising.
    """
    from urllib.parse import unquote, urlparse

    citekey = None
    parsed_url = urlparse(url)
    # urlparse reports hostname=None when the string has no network location.
    hostname = parsed_url.hostname or ""
    domain_levels = hostname.split(".")
    if domain_levels[-2:] == ["doi", "org"]:
        # DOI URLs
        doi = unquote(parsed_url.path.lstrip("/"))
        citekey = f"doi:{doi}"
    # Slice rather than index so single-label hostnames do not raise IndexError.
    if domain_levels[-2:-1] == ["sci-hub"]:
        # Sci-Hub domains
        doi = parsed_url.path.lstrip("/")
        citekey = f"doi:{doi}"
    if domain_levels[-2:] == ["biorxiv", "org"]:
        # bioRxiv URL to DOI.
        # NOTE(review): the pattern was lost from the rendered listing and is
        # restored from the upstream project -- confirm.
        match = re.search(
            r"/(?P<biorxiv_id>([0-9]{4}\.[0-9]{2}\.[0-9]{2}\.)?[0-9]{6,})",
            parsed_url.path,
        )
        if match:
            citekey = f"doi:10.1101/{match.group('biorxiv_id')}"
    # NOTE(review): the hostname literal was emptied in the rendered listing,
    # which made this test always true; restored as the NCBI domain -- confirm.
    is_ncbi_url = hostname.endswith("ncbi.nlm.nih.gov")
    if is_ncbi_url and parsed_url.path.startswith("/pubmed/"):
        # PubMed URLs
        try:
            pmid = parsed_url.path.split("/")[2]
            citekey = f"pmid:{pmid}"
        except IndexError:
            pass
    if is_ncbi_url and parsed_url.path.startswith("/pmc/"):
        # PubMed Central URLs
        try:
            pmcid = parsed_url.path.split("/")[3]
            citekey = f"pmcid:{pmcid}"
        except IndexError:
            pass
    if domain_levels[-2:] == ["wikidata", "org"] and parsed_url.path.startswith(
        "/wiki/"
    ):
        # Wikidata URLs
        try:
            wikidata_id = parsed_url.path.split("/")[2]
            citekey = f"wikidata:{wikidata_id}"
        except IndexError:
            pass
    if domain_levels[-2:] == ["arxiv", "org"]:
        # arXiv identifiers
        try:
            arxiv_id = parsed_url.path.split("/", maxsplit=2)[2]
            if arxiv_id.endswith(".pdf"):
                arxiv_id = arxiv_id[:-4]
            citekey = f"arxiv:{arxiv_id}"
        except IndexError:
            pass
    # Keep the derived citekey only when it passes inspection.
    if citekey is None or CiteKey(citekey).inspect() is not None:
        citekey = f"url:{url}"
    return citekey



class CiteKey(
    input_id: str,
    aliases: dict = <factory>,
    infer_prefix: bool = True
)

CiteKey(input_id: str, aliases: dict = <factory>, infer_prefix: bool = True)

View Source

@dataclasses.dataclass
class CiteKey:
    """
    A citation key and the identifiers derived from it.

    Derived values (prefix, accession and their standardized forms) are
    computed lazily on first access and stored on the instance.
    """

    input_id: str
    """Input identifier for the citekey"""
    aliases: dict = dataclasses.field(default_factory=dict)
    """Mapping from input identifier to aliases"""
    infer_prefix: bool = True
    """Whether to infer the citekey's prefix when a prefix is missing or unhandled"""

    def __post_init__(self):
        """Validate `input_id` immediately after dataclass construction."""
        self.check_input_id(self.input_id)

    @staticmethod
    def check_input_id(input_id) -> None:
        """
        Validate a candidate input identifier.

        Raises TypeError when `input_id` is not a str and ValueError when it
        starts with '@'.
        """
        if not isinstance(input_id, str):
            raise TypeError(
                "input_id should be type 'str' not "
                f"{type(input_id).__name__!r}: {input_id!r}"
            )
        if input_id.startswith("@"):
            raise ValueError(f"invalid citekey input_id: {input_id!r}\nstarts with '@'")

    @classmethod
    @functools.lru_cache(maxsize=None)
    def from_input_id(cls, *args, **kwargs) -> "CiteKey":
        """Cached constructor"""
        # The cache keys on the call arguments, so they must all be hashable.
        return cls(*args, **kwargs)

    @cached_property
    def dealiased_id(self) -> str:
        """
        If `self.input_id` is in `self.aliases`, the value specified by
        `self.aliases`. Otherwise, `self.input_id`.
        """
        return self.aliases.get(self.input_id, self.input_id)

    def _set_prefix_accession(self) -> None:
        """
        Set `self._prefix` and `self._accession` by splitting
        `self.dealiased_id` on its first colon, then optionally infer a
        prefix when the split one is missing or unknown.
        """
        self._prefix = None
        self._accession = None
        split_id = self.dealiased_id.split(":", 1)
        if len(split_id) == 2:
            self._prefix, self._accession = split_id
        if self.infer_prefix and not self.is_known_prefix:
            self._infer_prefix()

    def _infer_prefix(self) -> None:
        """
        Treat `self.dealiased_id` as missing a prefix.
        If the prefix can be inferred, set `self._prefix` and `self._accession`.
        Only call this function from _set_prefix_accession,
        since it is not safe after instance attributes or properties have been cached.
        """
        from .handlers import infer_prefix

        prefix = infer_prefix(self.dealiased_id)
        if not prefix:
            return
        self._prefix = prefix
        # With an inferred prefix, the whole dealiased id is the accession.
        self._accession = self.dealiased_id

    @property
    def prefix(self) -> tp.Optional[str]:
        """
        If `self.input_id` contains a colon, the substring up to the first colon.
        Otherwise, None.
        """
        if not hasattr(self, "_prefix"):
            self._set_prefix_accession()
        return self._prefix

    @property
    def prefix_lower(self) -> tp.Optional[str]:
        """
        A lowercase version of `self.prefix` or None.
        """
        if self.prefix is None:
            return None
        return self.prefix.lower()

    @property
    def accession(self) -> tp.Optional[str]:
        """
        If `self.prefix`, the remainder of `self.input_id` following the first colon.
        """
        if not hasattr(self, "_accession"):
            self._set_prefix_accession()
        return self._accession

    @property
    def standard_prefix(self) -> tp.Optional[str]:
        """
        If the citekey is handled, the standard prefix specified by the handler.
        Otherwise, None.
        """
        if not hasattr(self, "_standard_prefix"):
            self._standardize()
        return self._standard_prefix

    @property
    def standard_accession(self) -> tp.Optional[str]:
        """
        If the citekey is handled, the standard accession specified by the handler.
        Otherwise, None.
        """
        if not hasattr(self, "_standard_accession"):
            self._standardize()
        return self._standard_accession

    @cached_property
    def handler(self):
        """
        The handler registered for `self.prefix_lower`, or a generic
        `Handler` built from that prefix when none is registered.
        """
        from .handlers import Handler, get_handler

        if self.is_handled_prefix:
            return get_handler(self.prefix_lower)
        return Handler(self.prefix_lower)

    @property
    def is_handled_prefix(self) -> bool:
        """Whether `self.prefix_lower` is a key of `prefix_to_handler`."""
        from .handlers import prefix_to_handler

        return self.prefix_lower in prefix_to_handler

    @property
    def is_known_prefix(self) -> bool:
        """Whether the prefix is handled or is a pandoc-xnos prefix."""
        return self.is_handled_prefix or self.is_pandoc_xnos_prefix()

    def inspect(self) -> tp.Optional[str]:
        """
        Inspect citekey for potential problems.
        If no problems are found, return None.
        Otherwise, returns a string describing the problem.
        """
        return self.handler.inspect(self)

    def _standardize(self) -> None:
        """
        Set `self._standard_prefix`, `self._standard_accession`, and `self._standard_id`.
        For citekeys without a prefix or with an unhandled prefix, _standard_prefix
        and _standard_accession are set to None.
        """
        if not self.is_handled_prefix:
            self._standard_prefix = None
            self._standard_accession = None
            self._standard_id = self.dealiased_id
            return
        fxn = self.handler.standardize_prefix_accession
        self._standard_prefix, self._standard_accession = fxn(self.accession)
        self._standard_id = f"{self._standard_prefix}:{self._standard_accession}"

    @property
    def standard_id(self) -> str:
        """
        If the citekey is handled, the standard_id specified by the handler.
        Otherwise, `self.dealiased_id`.
        """
        if not hasattr(self, "_standard_id"):
            self._standardize()
        return self._standard_id

    @cached_property
    def short_id(self) -> str:
        """
        A hashed version of standard_id whose characters are
        within the ranges 0-9, a-z and A-Z.
        """
        return shorten_citekey(self.standard_id)

    @cached_property
    def all_ids(self) -> tp.List[str]:
        """The distinct, non-empty identifiers of this citekey, in derivation order."""
        ids = [self.input_id, self.dealiased_id, self.standard_id, self.short_id]
        ids = [x for x in ids if x]  # remove None
        ids = list(dict.fromkeys(ids))  # deduplicate
        return ids

    def __hash__(self):
        """Hash on the input and dealiased identifiers."""
        return hash((self.input_id, self.dealiased_id))

    def __repr__(self):
        """Join the main identifiers as 'value (name)' pairs separated by ' --> '."""
        # NOTE(review): the key tuple was lost from the rendered listing and is
        # restored from the upstream project -- confirm.
        return " --> ".join(
            f"{getattr(self, key)} ({key})"
            for key in (
                "input_id",
                "dealiased_id",
                "prefix_lower",
                "accession",
                "standard_id",
                "short_id",
            )
        )

    @cached_property
    def csl_item(self):
        """
        The CSL item returned by the handler, wrapped in `CSL_Item` when the
        handler returned some other type.
        """
        from .csl_item import CSL_Item

        csl_item = self.handler.get_csl_item(self)
        if not isinstance(csl_item, CSL_Item):
            csl_item = CSL_Item(csl_item)
        # NOTE(review): this call was lost from the rendered listing and is
        # restored from the upstream project -- confirm.
        csl_item.set_id(self.standard_id)
        return csl_item

    def is_pandoc_xnos_prefix(self, log_case_warning: bool = False) -> bool:
        """
        Return True when `self.prefix` is exactly one of the pandoc-xnos prefixes.

        When `log_case_warning` is set and the prefix only matches after
        lowercasing, log a warning and still return False.
        """
        from .handlers import _pandoc_xnos_prefixes

        if self.prefix in _pandoc_xnos_prefixes:
            return True
        if log_case_warning and self.prefix_lower in _pandoc_xnos_prefixes:
            logging.warning(
                "pandoc-xnos prefixes should be all lowercase.\n"
                f'Should {self.input_id!r} use {self.prefix_lower!r} rather than "{self.prefix!r}"?'
            )
        return False

Class variables


Static methods


def check_input_id(
    input_id
) -> None
View Source

    def check_input_id(input_id) -> None:

        if not isinstance(input_id, str):

            raise TypeError(

                "input_id should be type 'str' not "

                f"{type(input_id).__name__!r}: {input_id!r}"


        if input_id.startswith("@"):

            raise ValueError(f"invalid citekey input_id: {input_id!r}\nstarts with '@'")


def from_input_id(
    *args,
    **kwargs
) -> 'CiteKey'

Cached constructor

View Source


    def from_input_id(cls, *args, **kwargs) -> "CiteKey":

        """Cached constructor"""

        return cls(*args, **kwargs)

Instance variables


accession

If self.prefix, the remainder of self.input_id following the first colon.


prefix

If self.input_id contains a colon, the substring up to the first colon.

Otherwise, None.


prefix_lower

A lowercase version of self.prefix or None.


standard_accession

If the citekey is handled, the standard accession specified by the handler.

Otherwise, None.


standard_id

If the citekey is handled, the standard_id specified by the handler.

Otherwise, self.dealiased_id.


standard_prefix

If the citekey is handled, the standard prefix specified by the handler.

Otherwise, None.



def all_ids(


def csl_item(


def dealiased_id(

If self.input_id is in self.aliases, the value specified by

self.aliases. Otherwise, self.input_id.


def handler(


def inspect(
) -> Optional[str]

Inspect citekey for potential problems.

If no problems are found, return None. Otherwise, returns a string describing the problem.

View Source
    def inspect(self) -> tp.Optional[str]:


        Inspect citekey for potential problems.

        If no problems are found, return None.

        Otherwise, returns a string describing the problem.


        return self.handler.inspect(self)


def is_pandoc_xnos_prefix(
    log_case_warning: bool = False
) -> bool
View Source
    def is_pandoc_xnos_prefix(self, log_case_warning: bool = False) -> bool:
        """
        Return True when `self.prefix` is exactly one of the pandoc-xnos prefixes.

        When `log_case_warning` is set and the prefix only matches after
        lowercasing, log a warning and still return False.
        """
        from .handlers import _pandoc_xnos_prefixes

        if self.prefix in _pandoc_xnos_prefixes:
            return True
        if log_case_warning and self.prefix_lower in _pandoc_xnos_prefixes:
            logging.warning(
                "pandoc-xnos prefixes should be all lowercase.\n"
                f'Should {self.input_id!r} use {self.prefix_lower!r} rather than "{self.prefix!r}"?'
            )
        return False


def short_id(

A hashed version of standard_id whose characters are

within the ranges 0-9, a-z and A-Z.