Skip to content

Module manubot.cite.arxiv

View Source
import logging

import re

import xml.etree.ElementTree

import requests

from manubot.util import get_manubot_user_agent

from .csl_item import CSL_Item

from .handlers import Handler

class Handler_arXiv(Handler):
    """
    Citekey handler for arXiv identifiers (the `arxiv` prefix).

    `accession_pattern` captures the identifier without its version as
    `versionless_id` and the optional `v<N>` suffix as `version`.
    """

    standard_prefix = "arxiv"
    prefixes = ["arxiv"]
    accession_pattern = re.compile(
        r"(?P<versionless_id>[0-9]{4}\.[0-9]{4,5}|[a-z\-]+(\.[A-Z]{2})?/[0-9]{7})(?P<version>v[0-9]+)?"
    )

    def inspect(self, citekey):
        """
        Return a message describing the problem when `citekey.accession`
        does not fully match the pattern from `self._get_pattern()`;
        return None when it does.
        """
        # https://arxiv.org/help/arxiv_identifier
        if self._get_pattern().fullmatch(citekey.accession):
            return None
        return "arXiv identifiers must conform to syntax described at https://arxiv.org/help/arxiv_identifier."

    def get_csl_item(self, citekey):
        """Fetch the CSL Item for the citekey's standard accession."""
        accession = citekey.standard_accession
        return get_arxiv_csl_item(accession)

class CSL_Item_arXiv(CSL_Item):
    """CSL_Item subclass with helpers for filling in arXiv-specific fields."""

    def _set_invariant_fields(self):
        """Set the fields that take the same value for every arXiv record."""
        # Set journal/publisher to arXiv
        self["container-title"] = "arXiv"
        self["publisher"] = "arXiv"
        # Set CSL type to report for preprint
        self["type"] = "report"
        return self

    def log_journal_doi(self, arxiv_id, journal_ref=None):
        """
        Log at INFO level that the arXiv record has an associated DOI,
        appending `journal_ref` when given. Does nothing without a "DOI" field.
        """
        if "DOI" in self:
            pieces = [
                f"arXiv article {arxiv_id} published at https://doi.org/{self['DOI']}"
            ]
            if journal_ref:
                pieces.append(f" — {journal_ref}")
            logging.info("".join(pieces))

    def set_identifier_fields(self, arxiv_id):
        """
        Populate "id", "URL" and "number" from `arxiv_id`, plus "version"
        when the identifier carries a version suffix.
        """
        self.set_id(f"arxiv:{arxiv_id}")
        self["URL"] = f"https://arxiv.org/abs/{arxiv_id}"
        self["number"] = arxiv_id
        version = split_arxiv_id_version(arxiv_id)[1]
        if version:
            self["version"] = version

def split_arxiv_id_version(arxiv_id: str):
    """
    Return (versionless_id, version) tuple.

    Version refers to the version suffix like 'v2' or None.

    Raises ValueError when `arxiv_id` does not begin with text matching
    `Handler_arXiv.accession_pattern`.
    """
    match = Handler_arXiv.accession_pattern.match(arxiv_id)
    if match is None:
        # Without this guard the failure surfaces as an opaque
        # AttributeError on `None.group`.
        raise ValueError(f"{arxiv_id!r} is not a valid arXiv identifier")
    return match.group("versionless_id"), match.group("version")

def get_arxiv_csl_item(arxiv_id: str):
    """
    Return a CSL Item for an arXiv identifier.

    Versioned identifiers are sent to the export API and unversioned ones
    to the OAI endpoint, since only one endpoint supports versioning.
    """
    version = split_arxiv_id_version(arxiv_id)[1]
    fetch = get_arxiv_csl_item_export_api if version else get_arxiv_csl_item_oai
    return fetch(arxiv_id)

def query_arxiv_api(url, params):
    """
    Send a GET request to `url` with query parameters `params` and return
    the root element of the parsed XML response.

    The request carries the Manubot User-Agent header. An HTTP error
    status raises through `response.raise_for_status()`.
    """
    headers = {"User-Agent": get_manubot_user_agent()}
    response = requests.get(url, params=params, headers=headers)
    response.raise_for_status()
    # Parse the raw bytes rather than `response.text`: the XML parser then
    # applies the document's own encoding instead of the charset requests
    # guesses from the HTTP headers, which can mis-decode non-ASCII text.
    return xml.etree.ElementTree.fromstring(response.content)

def get_arxiv_csl_item_export_api(arxiv_id):
    """
    Return csl_item item for an arXiv record.

    arxiv_id can be versioned, like `1512.00567v2`, or versionless, like
    `1512.00567`. If versionless, the arXiv API will return metadata for the
    latest version. Legacy IDs, such as `cond-mat/0703470v2`, are also
    supported.

    If arXiv has an associated DOI for the record, an info-level message is
    logged to alert the user that an alternative version of record exists.

    Raises ValueError when the response does not contain exactly one entry.

    References:

    - https://arxiv.org/help/api/index
    - http://citeproc-js.readthedocs.io/en/latest/csl-json/markup.html
    - https://github.com/citation-style-language/schema/blob/master/csl-data.json
    """
    xml_tree = query_arxiv_api(
        url="https://export.arxiv.org/api/query",
        params={"id_list": arxiv_id, "max_results": 1},
    )
    # XML namespace prefixes
    prefix = "{http://www.w3.org/2005/Atom}"
    alt_prefix = "{http://arxiv.org/schemas/atom}"
    # Parse XML: exactly one entry is expected for a single-id query
    entries = xml_tree.findall(prefix + "entry")
    if len(entries) != 1:
        raise ValueError(
            f"expected exactly one arXiv API entry for {arxiv_id!r}, "
            f"found {len(entries)}"
        )
    (entry,) = entries
    # Create dictionary for CSL Item
    csl_item = CSL_Item_arXiv()
    # Extract versioned arXiv ID from the entry's abstract-page URL
    url = entry.findtext(prefix + "id")
    match = re.search(r"arxiv.org/abs/(.+)", url)
    versioned_id = match.group(1)
    csl_item.set_identifier_fields(versioned_id)
    # Extract CSL title field
    csl_item["title"] = entry.findtext(prefix + "title")
    # Extract CSL date field
    published = entry.findtext(prefix + "published")
    csl_item.set_date(published, variable="issued")
    # Extract authors
    csl_item["author"] = [
        {"literal": elem.findtext(prefix + "name")}
        for elem in entry.findall(prefix + "author")
    ]
    csl_item._set_invariant_fields()
    # Extract abstract; a missing <summary> is treated like an empty one
    abstract = entry.findtext(prefix + "summary", default="").strip()
    if abstract:
        # remove newlines that were added to wrap abstract
        csl_item["abstract"] = remove_newlines(abstract)
    # Check if the article has been published with a DOI
    doi = entry.findtext(f"{alt_prefix}doi")
    if doi:
        csl_item["DOI"] = doi
        journal_ref = entry.findtext(alt_prefix + "journal_ref")
        csl_item.log_journal_doi(arxiv_id, journal_ref)
    return csl_item

def get_arxiv_csl_item_oai(arxiv_id):
    """
    Build a CSL Item for an unversioned arXiv identifier from arXiv's
    OAI-PMH v2.0 interface <https://arxiv.org/help/oa>.

    This endpoint does not support versioned `arxiv_id`. A warning is
    logged when the identifier in the response differs from `arxiv_id`.
    """
    # XML namespace prefixes
    oai = "{http://www.openarchives.org/OAI/2.0/}"
    arx = "{http://arxiv.org/OAI/arXiv/}"
    request_params = {
        "verb": "GetRecord",
        "metadataPrefix": "arXiv",
        "identifier": f"oai:arXiv.org:{arxiv_id}",
    }
    root = query_arxiv_api(url="https://export.arxiv.org/oai2", params=request_params)
    item = CSL_Item_arXiv()

    # Each of these lookups must yield exactly one element
    record_path = f"{oai}GetRecord/{oai}record/"
    (header,) = root.findall(f"{record_path}{oai}header")
    (metadata,) = root.findall(f"{record_path}{oai}metadata")
    (record,) = metadata.findall(f"{arx}arXiv")

    # Identifier fields come from the response, not the request
    returned_id = record.findtext(f"{arx}id")
    if returned_id != arxiv_id:
        logging.warning(
            "arXiv oai2 query returned a different arxiv_id:"
            f" {arxiv_id} became {returned_id}"
        )
    item.set_identifier_fields(returned_id)

    # Title (whitespace collapsed) and date
    raw_title = record.findtext(f"{arx}title")
    if raw_title:
        item["title"] = " ".join(raw_title.split())
    item.set_date(header.findtext(f"{oai}datestamp"), "issued")

    # Authors: keep only the name parts that are present and non-empty
    authors = []
    for node in record.findall(f"{arx}authors/{arx}author"):
        names = {
            "given": node.findtext(f"{arx}forenames"),
            "family": node.findtext(f"{arx}keyname"),
        }
        authors.append({key: text for key, text in names.items() if text})
    item["author"] = authors
    item._set_invariant_fields()

    summary = record.findtext(f"{arx}abstract")
    if summary:
        item["abstract"] = remove_newlines(summary)

    license_text = record.findtext(f"{arx}license")
    if license_text:
        item.note_append_dict({"license": license_text})

    doi = record.findtext(f"{arx}doi")
    if doi:
        item["DOI"] = doi
        item.log_journal_doi(arxiv_id, record.findtext(f"{arx}journal-ref"))
    return item

def remove_newlines(text):
    """
    Replace each newline that is not followed by whitespace with a space.

    Newlines followed by whitespace (including another newline) are kept.
    """
    wrapped_newline = re.compile(r"\n(?!\s)")
    return wrapped_newline.sub(" ", text)

def get_arxiv_csl_item_zotero(arxiv_id):
    """
    Return CSL JSON Data for an arXiv ID obtained via Zotero's
    translation-server, by delegating to `manubot.cite.zotero.get_csl_item`.
    """
    from manubot.cite.zotero import get_csl_item

    identifier = f"arxiv:{arxiv_id}"
    return get_csl_item(identifier)

Functions

get_arxiv_csl_item

def get_arxiv_csl_item(
    arxiv_id: str
)

Return csl_item item for an arXiv identifier.

Chooses which arXiv API to use based on whether arxiv_id is versioned, since only one endpoint supports versioning.

View Source
def get_arxiv_csl_item(arxiv_id: str):
    """
    Return a CSL Item for an arXiv identifier.

    Versioned identifiers are sent to the export API and unversioned ones
    to the OAI endpoint, since only one endpoint supports versioning.
    """
    version = split_arxiv_id_version(arxiv_id)[1]
    fetch = get_arxiv_csl_item_export_api if version else get_arxiv_csl_item_oai
    return fetch(arxiv_id)

get_arxiv_csl_item_export_api

def get_arxiv_csl_item_export_api(
    arxiv_id
)

Return csl_item item for an arXiv record.

arxiv_id can be versioned, like 1512.00567v2, or versionless, like 1512.00567. If versionless, the arXiv API will return metadata for the latest version. Legacy IDs, such as cond-mat/0703470v2, are also supported.

If arXiv has an associated DOI for the record, an info-level message is logged to alert the user that an alternative version of record exists.

References:

- https://arxiv.org/help/api/index
- http://citeproc-js.readthedocs.io/en/latest/csl-json/markup.html
- https://github.com/citation-style-language/schema/blob/master/csl-data.json

View Source
def get_arxiv_csl_item_export_api(arxiv_id):
    """
    Return csl_item item for an arXiv record.

    arxiv_id can be versioned, like `1512.00567v2`, or versionless, like
    `1512.00567`. If versionless, the arXiv API will return metadata for the
    latest version. Legacy IDs, such as `cond-mat/0703470v2`, are also
    supported.

    If arXiv has an associated DOI for the record, an info-level message is
    logged to alert the user that an alternative version of record exists.

    Raises ValueError when the response does not contain exactly one entry.

    References:

    - https://arxiv.org/help/api/index
    - http://citeproc-js.readthedocs.io/en/latest/csl-json/markup.html
    - https://github.com/citation-style-language/schema/blob/master/csl-data.json
    """
    xml_tree = query_arxiv_api(
        url="https://export.arxiv.org/api/query",
        params={"id_list": arxiv_id, "max_results": 1},
    )
    # XML namespace prefixes
    prefix = "{http://www.w3.org/2005/Atom}"
    alt_prefix = "{http://arxiv.org/schemas/atom}"
    # Parse XML: exactly one entry is expected for a single-id query
    entries = xml_tree.findall(prefix + "entry")
    if len(entries) != 1:
        raise ValueError(
            f"expected exactly one arXiv API entry for {arxiv_id!r}, "
            f"found {len(entries)}"
        )
    (entry,) = entries
    # Create dictionary for CSL Item
    csl_item = CSL_Item_arXiv()
    # Extract versioned arXiv ID from the entry's abstract-page URL
    url = entry.findtext(prefix + "id")
    match = re.search(r"arxiv.org/abs/(.+)", url)
    versioned_id = match.group(1)
    csl_item.set_identifier_fields(versioned_id)
    # Extract CSL title field
    csl_item["title"] = entry.findtext(prefix + "title")
    # Extract CSL date field
    published = entry.findtext(prefix + "published")
    csl_item.set_date(published, variable="issued")
    # Extract authors
    csl_item["author"] = [
        {"literal": elem.findtext(prefix + "name")}
        for elem in entry.findall(prefix + "author")
    ]
    csl_item._set_invariant_fields()
    # Extract abstract; a missing <summary> is treated like an empty one
    abstract = entry.findtext(prefix + "summary", default="").strip()
    if abstract:
        # remove newlines that were added to wrap abstract
        csl_item["abstract"] = remove_newlines(abstract)
    # Check if the article has been published with a DOI
    doi = entry.findtext(f"{alt_prefix}doi")
    if doi:
        csl_item["DOI"] = doi
        journal_ref = entry.findtext(alt_prefix + "journal_ref")
        csl_item.log_journal_doi(arxiv_id, journal_ref)
    return csl_item

get_arxiv_csl_item_oai

def get_arxiv_csl_item_oai(
    arxiv_id
)

Generate a CSL Item for an unversioned arXiv identifier using arXiv's OAI-PMH v2.0 API https://arxiv.org/help/oa. This endpoint does not support versioned arxiv_id.

View Source
def get_arxiv_csl_item_oai(arxiv_id):
    """
    Build a CSL Item for an unversioned arXiv identifier from arXiv's
    OAI-PMH v2.0 interface <https://arxiv.org/help/oa>.

    This endpoint does not support versioned `arxiv_id`. A warning is
    logged when the identifier in the response differs from `arxiv_id`.
    """
    # XML namespace prefixes
    oai = "{http://www.openarchives.org/OAI/2.0/}"
    arx = "{http://arxiv.org/OAI/arXiv/}"
    request_params = {
        "verb": "GetRecord",
        "metadataPrefix": "arXiv",
        "identifier": f"oai:arXiv.org:{arxiv_id}",
    }
    root = query_arxiv_api(url="https://export.arxiv.org/oai2", params=request_params)
    item = CSL_Item_arXiv()

    # Each of these lookups must yield exactly one element
    record_path = f"{oai}GetRecord/{oai}record/"
    (header,) = root.findall(f"{record_path}{oai}header")
    (metadata,) = root.findall(f"{record_path}{oai}metadata")
    (record,) = metadata.findall(f"{arx}arXiv")

    # Identifier fields come from the response, not the request
    returned_id = record.findtext(f"{arx}id")
    if returned_id != arxiv_id:
        logging.warning(
            "arXiv oai2 query returned a different arxiv_id:"
            f" {arxiv_id} became {returned_id}"
        )
    item.set_identifier_fields(returned_id)

    # Title (whitespace collapsed) and date
    raw_title = record.findtext(f"{arx}title")
    if raw_title:
        item["title"] = " ".join(raw_title.split())
    item.set_date(header.findtext(f"{oai}datestamp"), "issued")

    # Authors: keep only the name parts that are present and non-empty
    authors = []
    for node in record.findall(f"{arx}authors/{arx}author"):
        names = {
            "given": node.findtext(f"{arx}forenames"),
            "family": node.findtext(f"{arx}keyname"),
        }
        authors.append({key: text for key, text in names.items() if text})
    item["author"] = authors
    item._set_invariant_fields()

    summary = record.findtext(f"{arx}abstract")
    if summary:
        item["abstract"] = remove_newlines(summary)

    license_text = record.findtext(f"{arx}license")
    if license_text:
        item.note_append_dict({"license": license_text})

    doi = record.findtext(f"{arx}doi")
    if doi:
        item["DOI"] = doi
        item.log_journal_doi(arxiv_id, record.findtext(f"{arx}journal-ref"))
    return item

get_arxiv_csl_item_zotero

def get_arxiv_csl_item_zotero(
    arxiv_id
)

Generate CSL JSON Data for an arXiv ID using Zotero's translation-server.

View Source
def get_arxiv_csl_item_zotero(arxiv_id):
    """
    Return CSL JSON Data for an arXiv ID obtained via Zotero's
    translation-server, by delegating to `manubot.cite.zotero.get_csl_item`.
    """
    from manubot.cite.zotero import get_csl_item

    identifier = f"arxiv:{arxiv_id}"
    return get_csl_item(identifier)

query_arxiv_api

def query_arxiv_api(
    url,
    params
)
View Source
def query_arxiv_api(url, params):
    """
    Send a GET request to `url` with query parameters `params` and return
    the root element of the parsed XML response.

    The request carries the Manubot User-Agent header. An HTTP error
    status raises through `response.raise_for_status()`.
    """
    headers = {"User-Agent": get_manubot_user_agent()}
    response = requests.get(url, params=params, headers=headers)
    response.raise_for_status()
    # Parse the raw bytes rather than `response.text`: the XML parser then
    # applies the document's own encoding instead of the charset requests
    # guesses from the HTTP headers, which can mis-decode non-ASCII text.
    return xml.etree.ElementTree.fromstring(response.content)

remove_newlines

def remove_newlines(
    text
)
View Source
def remove_newlines(text):
    """
    Replace each newline that is not followed by whitespace with a space.

    Newlines followed by whitespace (including another newline) are kept.
    """
    wrapped_newline = re.compile(r"\n(?!\s)")
    return wrapped_newline.sub(" ", text)

split_arxiv_id_version

def split_arxiv_id_version(
    arxiv_id: str
)

Return (versionless_id, version) tuple.

Version refers to the version suffix like 'v2' or None.

View Source
def split_arxiv_id_version(arxiv_id: str):
    """
    Return (versionless_id, version) tuple.

    Version refers to the version suffix like 'v2' or None.

    Raises ValueError when `arxiv_id` does not begin with text matching
    `Handler_arXiv.accession_pattern`.
    """
    match = Handler_arXiv.accession_pattern.match(arxiv_id)
    if match is None:
        # Without this guard the failure surfaces as an opaque
        # AttributeError on `None.group`.
        raise ValueError(f"{arxiv_id!r} is not a valid arXiv identifier")
    return match.group("versionless_id"), match.group("version")

Classes

CSL_Item_arXiv

class CSL_Item_arXiv(
    dictionary=None,
    **kwargs
)

CSL_Item represents bibliographic information for a single citeable work.

On a technical side CSL_Item is a Python dictionary with extra methods that help cleaning and manipulating it.

These methods relate to: - adding an id key and value for CSL item - correcting bibliographic information and its structure - adding and reading a custom note to CSL item

More information on CSL JSON (a list of CSL_Items) is available at: - https://citeproc-js.readthedocs.io/en/latest/csl-json/markup.html - http://docs.citationstyles.org/en/1.0.1/specification.html#standard-variables - https://github.com/citation-style-language/schema/blob/master/csl-data.json

View Source
class CSL_Item_arXiv(CSL_Item):
    """CSL_Item subclass with helpers for filling in arXiv-specific fields."""

    def _set_invariant_fields(self):
        """Set the fields that take the same value for every arXiv record."""
        # Set journal/publisher to arXiv
        self["container-title"] = "arXiv"
        self["publisher"] = "arXiv"
        # Set CSL type to report for preprint
        self["type"] = "report"
        return self

    def log_journal_doi(self, arxiv_id, journal_ref=None):
        """
        Log at INFO level that the arXiv record has an associated DOI,
        appending `journal_ref` when given. Does nothing without a "DOI" field.
        """
        if "DOI" in self:
            pieces = [
                f"arXiv article {arxiv_id} published at https://doi.org/{self['DOI']}"
            ]
            if journal_ref:
                pieces.append(f" — {journal_ref}")
            logging.info("".join(pieces))

    def set_identifier_fields(self, arxiv_id):
        """
        Populate "id", "URL" and "number" from `arxiv_id`, plus "version"
        when the identifier carries a version suffix.
        """
        self.set_id(f"arxiv:{arxiv_id}")
        self["URL"] = f"https://arxiv.org/abs/{arxiv_id}"
        self["number"] = arxiv_id
        version = split_arxiv_id_version(arxiv_id)[1]
        if version:
            self["version"] = version

Ancestors (in MRO)

  • manubot.cite.csl_item.CSL_Item
  • builtins.dict

Class variables

type_mapping

Instance variables

note

Return the value of the "note" field as a string.

If "note" key is not set, return empty string.

note_dict

Return a dictionary with key-value pairs encoded by this CSL Item's note.

Extracts both forms (line-entry and braced-entry) of key-value pairs from the CSL JSON "cheater syntax" https://github.com/Juris-M/citeproc-js-docs/blob/93d7991d42b4a96b74b7281f38e168e365847e40/csl-json/markup.rst#cheater-syntax-for-odd-fields

Assigning to this dict will not update self["note"].

Methods

clean

def clean(
    self,
    prune: bool = True
) -> 'CSL_Item'

Sanitize and touch-up a potentially dirty CSL_Item.

The following steps are performed: - update incorrect values for "type" field when a correct variant is known - remove fields that violate the JSON Schema (if prune=True) - set default value for "type" if missing, since CSL JSON requires type - validate against the CSL JSON schema (if prune=True) to ensure output CSL_Item is clean

View Source
    def clean(self, prune: bool = True) -> "CSL_Item":
        """
        Sanitize and touch-up a potentially dirty CSL_Item.

        The following steps are performed:

        - update incorrect values for "type" field when a correct variant is known
        - remove fields that violate the JSON Schema (if prune=True)
        - set default value for "type" if missing, since CSL JSON requires type
        - validate against the CSL JSON schema (if prune=True) to ensure output
          CSL_Item is clean

        Returns self.
        """
        # The trailing space after with/without keeps the message from
        # reading "withCSL pruning" / "withoutCSL pruning".
        logging.debug(
            f"Starting CSL_Item.clean with{'' if prune else 'out'} "
            f"CSL pruning for id: {self.get('id', 'id not specified')}"
        )
        self.correct_invalid_type()
        if prune:
            self.prune_against_schema()
        self.set_default_type()
        if prune:
            self.validate_against_schema()
        return self

clear

def clear(
    ...
)

D.clear() -> None. Remove all items from D.

copy

def copy(
    ...
)

D.copy() -> a shallow copy of D

correct_invalid_type

def correct_invalid_type(
    self
) -> 'CSL_Item'

Correct invalid CSL item type.

Does nothing if type not present.

For detail see https://github.com/CrossRef/rest-api-doc/issues/187

View Source
    def correct_invalid_type(self) -> "CSL_Item":
        """
        Replace a known-invalid CSL item type with its correct variant.

        A `type` found among the keys of `self.type_mapping` is swapped for
        the mapped value; any other type is left intact. Does nothing if
        `type` is not present.
        For detail see https://github.com/CrossRef/rest-api-doc/issues/187
        """
        try:
            current = self["type"]
        except KeyError:
            return self
        self["type"] = self.type_mapping.get(current, current)
        return self

fromkeys

def fromkeys(
    iterable,
    value=None,
    /
)

Create a new dictionary with keys from iterable and values set to value.

get

def get(
    self,
    key,
    default=None,
    /
)

Return the value for key if key is in the dictionary, else default.

get_date

def get_date(
    self,
    variable: str = 'issued',
    fill: bool = False
) -> Optional[str]

Return a CSL date-variable as ISO formatted string:

('YYYY', 'YYYY-MM', 'YYYY-MM-DD', or None).

variable: which CSL JSON date variable to retrieve fill: if True, set missing months to January and missing days to the first day of the month.

View Source
    def get_date(self, variable: str = "issued", fill: bool = False) -> Optional[str]:
        """
        Return a CSL date-variable as an ISO formatted string
        ('YYYY', 'YYYY-MM', 'YYYY-MM-DD'), or None when the variable or its
        first date-parts entry is absent.

        variable: which CSL JSON date variable to retrieve
        fill: if True, set missing months to January
            and missing days to the first day of the month.
        """
        try:
            first_parts = self[variable]["date-parts"][0]
        except (IndexError, KeyError):
            iso_date = None
        else:
            iso_date = date_parts_to_string(first_parts, fill=fill)
        return iso_date

infer_id

def infer_id(
    self
) -> 'CSL_Item'

Detect and set a non-null/empty value for "id" or else raise a ValueError.

View Source
    def infer_id(self) -> "CSL_Item":
        """
        Detect and set a non-null/empty value for "id" or else raise a ValueError.

        Sources are tried in order: the "standard_citation" field (which is
        removed from the item), the "standard_id" note entry, then the
        existing "id" field.
        """
        if self.get("standard_citation"):
            # "standard_citation" field is set with a non-null/empty value
            inferred = self.pop("standard_citation")
        elif self.note_dict.get("standard_id"):
            # "standard_id" note field is set with a non-null/empty value
            inferred = self.note_dict["standard_id"]
        elif self.get("id"):
            # "id" field exists and is set with a non-null/empty value
            inferred = self["id"]
        else:
            raise ValueError(
                "infer_id could not detect a field with a citation / standard_citation. "
                'Consider setting the CSL Item "id" field.'
            )
        return self.set_id(inferred)

items

def items(
    ...
)

D.items() -> a set-like object providing a view on D's items

keys

def keys(
    ...
)

D.keys() -> a set-like object providing a view on D's keys

log_journal_doi

def log_journal_doi(
    self,
    arxiv_id,
    journal_ref=None
)
View Source
    def log_journal_doi(self, arxiv_id, journal_ref=None):
        """
        Log at INFO level that the arXiv record has an associated DOI,
        appending `journal_ref` when given. Does nothing without a "DOI" field.
        """
        if "DOI" in self:
            pieces = [
                f"arXiv article {arxiv_id} published at https://doi.org/{self['DOI']}"
            ]
            if journal_ref:
                pieces.append(f" — {journal_ref}")
            logging.info("".join(pieces))

note_append_dict

def note_append_dict(
    self,
    dictionary: dict
) -> None

Append key-value pairs specified by dictionary to the note field of a CSL Item.

Uses the CSL JSON "cheater syntax" to encode additional values not defined by the CSL JSON schema.

View Source
    def note_append_dict(self, dictionary: dict) -> None:
        """
        Append key-value pairs specified by `dictionary` to the note field of a CSL Item.

        Uses the [CSL JSON "cheater syntax"](https://github.com/Juris-M/citeproc-js-docs/blob/93d7991d42b4a96b74b7281f38e168e365847e40/csl-json/markup.rst#cheater-syntax-for-odd-fields)
        to encode additional values not defined by the CSL JSON schema.

        Pairs whose key is not a valid variable name, or whose value
        contains a newline, are skipped with a logged warning.
        """
        valid_key = re.compile(r"[A-Z]+|[-_a-z]+")
        for key, value in dictionary.items():
            if valid_key.fullmatch(key) is None:
                logging.warning(
                    f"note_append_dict: skipping adding {key!r} because "
                    f"it does not conform to the variable_name syntax as per https://git.io/fjTzW."
                )
            elif "\n" in value:
                logging.warning(
                    f"note_append_dict: skipping adding {key!r} because "
                    f"the value contains a newline: {value!r}"
                )
            else:
                self.note_append_text(f"{key}: {value}")

note_append_text

def note_append_text(
    self,
    text: str
) -> None

Append text to the note field (as a new line) of a CSL Item.

If a line already exists equal to text, do nothing.

View Source
    def note_append_text(self, text: str) -> None:
        """
        Append text to the note field (as a new line) of a CSL Item.

        Empty `text` is ignored. If a line already exists equal to `text`,
        do nothing.
        """
        if not text:
            return
        existing = self.note
        already_present = re.search(
            f"^{re.escape(text)}$", existing, flags=re.MULTILINE
        )
        if already_present:
            # do not accumulate duplicate lines of text
            # https://github.com/manubot/manubot/issues/258
            return
        # Start a fresh line unless the note is empty or already ends one
        separator = "\n" if existing and not existing.endswith("\n") else ""
        self.note = existing + separator + text

pop

def pop(
    ...
)

D.pop(k[,d]) -> v, remove specified key and return the corresponding value.

If the key is not found, return the default if given; otherwise, raise a KeyError.

popitem

def popitem(
    self,
    /
)

Remove and return a (key, value) pair as a 2-tuple.

Pairs are returned in LIFO (last-in, first-out) order. Raises KeyError if the dict is empty.

prune_against_schema

def prune_against_schema(
    self
) -> 'CSL_Item'

Remove fields that violate the CSL Item JSON Schema.

View Source
    def prune_against_schema(self) -> "CSL_Item":
        """
        Remove fields that violate the CSL Item JSON Schema.

        Pruning is done in place via `remove_jsonschema_errors`; returns self.
        """
        from .citeproc import remove_jsonschema_errors

        pruned_items = remove_jsonschema_errors([self], in_place=True)
        (only_item,) = pruned_items
        assert only_item is self
        return self

set_date

def set_date(
    self,
    date: Union[NoneType, str, datetime.date, datetime.datetime],
    variable: str = 'issued'
) -> 'CSL_Item'

date: date either as a string (in the form YYYY, YYYY-MM, or YYYY-MM-DD) or as a Python date object (datetime.date or datetime.datetime).

variable: which variable to assign the date to.

View Source
    def set_date(
        self,
        date: Union[None, str, datetime.date, datetime.datetime],
        variable: str = "issued",
    ) -> "CSL_Item":
        """
        Assign `date` to the CSL date variable named by `variable`.

        date: date either as a string (in the form YYYY, YYYY-MM, or YYYY-MM-DD)
            or as a Python date object (datetime.date or datetime.datetime).
        variable: which variable to assign the date to.

        The item is left unchanged when `date_to_date_parts` returns a
        falsy value. Returns self.
        """
        parts = date_to_date_parts(date)
        if not parts:
            return self
        self[variable] = {"date-parts": [parts]}
        return self

set_default_type

def set_default_type(
    self
) -> 'CSL_Item'

Set type to 'entry', if type not specified.

View Source
    def set_default_type(self) -> "CSL_Item":
        """
        Set type to 'entry', if type not specified. Returns self.
        """
        if "type" not in self:
            self["type"] = "entry"
        return self

set_id

def set_id(
    self,
    id_
) -> 'CSL_Item'
View Source
    def set_id(self, id_) -> "CSL_Item":
        """Store `id_` under the "id" key and return self."""
        self["id"] = id_
        return self

set_identifier_fields

def set_identifier_fields(
    self,
    arxiv_id
)
View Source
    def set_identifier_fields(self, arxiv_id):
        """
        Populate "id", "URL" and "number" from `arxiv_id`, plus "version"
        when the identifier carries a version suffix.
        """
        self.set_id(f"arxiv:{arxiv_id}")
        self["URL"] = f"https://arxiv.org/abs/{arxiv_id}"
        self["number"] = arxiv_id
        version = split_arxiv_id_version(arxiv_id)[1]
        if version:
            self["version"] = version

setdefault

def setdefault(
    self,
    key,
    default=None,
    /
)

Insert key with a value of default if key is not in the dictionary.

Return the value for key if key is in the dictionary, else default.

standardize_id

def standardize_id(
    self
) -> 'CSL_Item'

Extract the standard_id (standard citation key) for a csl_item and modify the csl_item in-place to set its "id" field.

The standard_id is extracted from a "standard_citation" field, the "note" field, or the "id" field. The extracted citation is checked for validity and standardized, after which it is the final "standard_id".

Regarding csl_item modification, the csl_item "id" field is set to the standard_citation and the note field is created or updated with key-value pairs for standard_id and original_id.

Note that the Manubot software generally refers to the "id" of a CSL Item as a citekey. However, in this context, we use "id" rather than "citekey" for consistency with CSL's "id" field.

View Source
    def standardize_id(self) -> "CSL_Item":
        """
        Extract the standard_id (standard citation key) for a csl_item and modify the csl_item in-place to set its "id" field.

        The standard_id is extracted from a "standard_citation" field, the "note" field, or the "id" field.
        The extracted citation is checked for validity and standardized, after which it is the final "standard_id".

        Regarding csl_item modification, the csl_item "id" field is set to the standard_citation and the note field
        is created or updated with key-value pairs for standard_id and original_id.

        Note that the Manubot software generally refers to the "id" of a CSL Item as a citekey.
        However, in this context, we use "id" rather than "citekey" for consistency with CSL's "id" field.
        """
        id_before = self.get("id")
        self.infer_id()
        inferred_id = self["id"]
        standard_id = CiteKey(inferred_id).standard_id
        recorded = self.note_dict

        # Only record values that differ from the standard id and are not
        # already present in the note.
        additions = {}
        if (
            id_before
            and id_before != standard_id
            and id_before != recorded.get("original_id")
        ):
            additions["original_id"] = id_before
        if (
            inferred_id
            and inferred_id != standard_id
            and inferred_id != recorded.get("original_standard_id")
        ):
            additions["original_standard_id"] = inferred_id
        if standard_id != recorded.get("standard_id"):
            additions["standard_id"] = standard_id

        self.note_append_dict(dictionary=additions)
        self.set_id(standard_id)
        return self

update

def update(
    ...
)

D.update([E, ]**F) -> None. Update D from dict/iterable E and F.

If E is present and has a .keys() method, then does: for k in E: D[k] = E[k] If E is present and lacks a .keys() method, then does: for k, v in E: D[k] = v In either case, this is followed by: for k in F: D[k] = F[k]

validate_against_schema

def validate_against_schema(
    self
) -> 'CSL_Item'

Confirm that the CSL_Item validates. If not, raises a jsonschema.exceptions.ValidationError.

View Source
    def validate_against_schema(self) -> "CSL_Item":
        """
        Validate this CSL_Item against the CSL JSON schema and return self.

        If the item does not validate, a
        jsonschema.exceptions.ValidationError is raised.
        """
        from .citeproc import get_jsonschema_csl_validator

        # The validator is applied to a one-item list containing this item
        get_jsonschema_csl_validator().validate([self])
        return self

values

def values(
    ...
)

D.values() -> an object providing a view on D's values

Handler_arXiv

class Handler_arXiv(
    prefix_lower: str
)

A Handler is a class that provides support for a certain type of citekey.

For example, a Handler subclass could provide support for DOI citekeys. Subclasses enable custom logic for different citekey prefixes, including how to standardize the citekey and how to retrieve CSL Item metadata.

View Source
class Handler_arXiv(Handler):
    """
    Citekey handler for arXiv identifiers (the `arxiv` prefix).

    `accession_pattern` captures the identifier without its version as
    `versionless_id` and the optional `v<N>` suffix as `version`.
    """

    standard_prefix = "arxiv"
    prefixes = ["arxiv"]
    accession_pattern = re.compile(
        r"(?P<versionless_id>[0-9]{4}\.[0-9]{4,5}|[a-z\-]+(\.[A-Z]{2})?/[0-9]{7})(?P<version>v[0-9]+)?"
    )

    def inspect(self, citekey):
        """
        Return a message describing the problem when `citekey.accession`
        does not fully match the pattern from `self._get_pattern()`;
        return None when it does.
        """
        # https://arxiv.org/help/arxiv_identifier
        if self._get_pattern().fullmatch(citekey.accession):
            return None
        return "arXiv identifiers must conform to syntax described at https://arxiv.org/help/arxiv_identifier."

    def get_csl_item(self, citekey):
        """Fetch the CSL Item for the citekey's standard accession."""
        accession = citekey.standard_accession
        return get_arxiv_csl_item(accession)

Ancestors (in MRO)

  • manubot.cite.handlers.Handler

Class variables

accession_pattern
prefixes
standard_prefix

Methods

get_csl_item

def get_csl_item(
    self,
    citekey
)

Return a CSL_Item with bibliographic details for citekey.

View Source
    def get_csl_item(self, citekey):
        """Fetch the CSL Item for the citekey's standard accession."""
        accession = citekey.standard_accession
        return get_arxiv_csl_item(accession)

inspect

def inspect(
    self,
    citekey
)

Check that citekeys adhere to expected formats. If an issue is detected, a string describing the issue is returned. Otherwise returns None.

View Source
    def inspect(self, citekey):
        """
        Return a message describing the problem when `citekey.accession`
        does not fully match the pattern from `self._get_pattern()`;
        return None when it does.
        """
        # https://arxiv.org/help/arxiv_identifier
        if self._get_pattern().fullmatch(citekey.accession):
            return None
        return "arXiv identifiers must conform to syntax described at https://arxiv.org/help/arxiv_identifier."

standardize_prefix_accession

def standardize_prefix_accession(
    self,
    accession: str
) -> Tuple[str, str]

Return (prefix, accession) in standardized form.

This method defaults to returning self.standard_prefix (or self.prefix_lower if standard_prefix is not defined). Subclasses can override this method with more specific standardization logic.

View Source
    def standardize_prefix_accession(self, accession: str) -> Tuple[str, str]:
        """
        Return (prefix, accession) in standardized form.

        The prefix is `self.standard_prefix` when that attribute exists,
        otherwise `self.prefix_lower`; the accession is returned unchanged.
        Subclasses can override this method with more specific standardization logic.
        """
        # Read the fallback first: it is looked up even when
        # standard_prefix is defined.
        fallback_prefix = self.prefix_lower
        prefix = getattr(self, "standard_prefix", fallback_prefix)
        return prefix, accession