Module manubot.cite.arxiv
View Source
import logging
import re
import xml.etree.ElementTree
import requests
from manubot.util import get_manubot_user_agent
from .csl_item import CSL_Item
from .handlers import Handler
class Handler_arXiv(Handler):
    """
    Citekey handler for the `arxiv` prefix.

    `accession_pattern` accepts two identifier shapes, each optionally
    followed by a version suffix such as `v2`:
    - four digits, a dot, then four or five digits (like `1512.00567`)
    - the legacy archive form ending in a slash and seven digits
      (like `cond-mat/0703470`)
    """

    standard_prefix = "arxiv"
    prefixes = ["arxiv"]
    accession_pattern = re.compile(
        r"(?P<versionless_id>[0-9]{4}\.[0-9]{4,5}|[a-z\-]+(\.[A-Z]{2})?/[0-9]{7})(?P<version>v[0-9]+)?"
    )

    def inspect(self, citekey):
        """
        Return a message describing the problem when `citekey.accession` is
        not fully matched by the pattern from `self._get_pattern()`.
        Return None when the accession is acceptable.
        """
        # https://arxiv.org/help/arxiv_identifier
        if self._get_pattern().fullmatch(citekey.accession):
            return None
        return "arXiv identifiers must conform to syntax described at https://arxiv.org/help/arxiv_identifier."

    def get_csl_item(self, citekey):
        """Return the CSL Item for the citekey's standardized accession."""
        accession = citekey.standard_accession
        return get_arxiv_csl_item(accession)
class CSL_Item_arXiv(CSL_Item):
    """CSL_Item subclass with helpers for filling in arXiv-specific fields."""

    def _set_invariant_fields(self):
        """Set the fields that are identical for every arXiv record; return self."""
        invariant_fields = (
            # journal/publisher are always arXiv
            ("container-title", "arXiv"),
            ("publisher", "arXiv"),
            # CSL type used to report a preprint
            ("type", "report"),
        )
        for field, value in invariant_fields:
            self[field] = value
        return self

    def log_journal_doi(self, arxiv_id, journal_ref=None):
        """
        When this item has a DOI, log (at info level) where the arXiv
        article was published, followed by `journal_ref` if provided.
        Does nothing when no DOI is set.
        """
        if "DOI" in self:
            fragments = [
                f"arXiv article {arxiv_id} published at https://doi.org/{self['DOI']}"
            ]
            if journal_ref:
                fragments.append(f" — {journal_ref}")
            logging.info("".join(fragments))

    def set_identifier_fields(self, arxiv_id):
        """
        Derive the id, URL, and number fields from `arxiv_id`, and the
        version field when `arxiv_id` carries a version suffix.
        """
        self.set_id(f"arxiv:{arxiv_id}")
        self["URL"] = f"https://arxiv.org/abs/{arxiv_id}"
        self["number"] = arxiv_id
        version = split_arxiv_id_version(arxiv_id)[1]
        if version:
            self["version"] = version
def split_arxiv_id_version(arxiv_id: str):
    """
    Return (versionless_id, version) tuple.

    `version` is the version suffix, like 'v2', or None when `arxiv_id`
    carries no version. Matching is anchored only at the start of
    `arxiv_id`, so any text following a valid identifier is ignored.

    Raises ValueError when `arxiv_id` does not start with an identifier
    matched by `Handler_arXiv.accession_pattern`.
    """
    match = Handler_arXiv.accession_pattern.match(arxiv_id)
    if match is None:
        # Without this guard the failure surfaced as an opaque
        # AttributeError raised on None.
        raise ValueError(f"{arxiv_id!r} is not a valid arXiv identifier")
    return match.group("versionless_id"), match.group("version")
def get_arxiv_csl_item(arxiv_id: str):
    """
    Return the CSL Item for an arXiv identifier.

    Versioned identifiers are routed to the export API, because it is the
    only endpoint that supports versions; versionless identifiers are routed
    to the OAI endpoint.
    """
    version = split_arxiv_id_version(arxiv_id)[1]
    fetch = get_arxiv_csl_item_export_api if version else get_arxiv_csl_item_oai
    return fetch(arxiv_id)
def query_arxiv_api(url, params, timeout=60):
    """
    Send a GET request to an arXiv endpoint and return the parsed XML root.

    url: endpoint URL.
    params: query parameters forwarded to `requests.get`.
    timeout: seconds to wait for the server before giving up. Without a
        timeout a stalled connection would block the caller indefinitely.

    Returns the root `xml.etree.ElementTree.Element` of the response body.
    Raises `requests.HTTPError` for an unsuccessful status code (via
    `raise_for_status`) and `xml.etree.ElementTree.ParseError` when the body
    is not well-formed XML.
    """
    headers = {"User-Agent": get_manubot_user_agent()}
    response = requests.get(url, params, headers=headers, timeout=timeout)
    response.raise_for_status()
    # Parse the raw bytes so the XML parser applies the encoding declared by
    # the document itself. `response.text` instead decodes with the charset
    # requests infers from the HTTP headers, which may not match the payload.
    return xml.etree.ElementTree.fromstring(response.content)
def get_arxiv_csl_item_export_api(arxiv_id):
    """
    Return csl_item item for an arXiv record.

    arxiv_id can be versioned, like `1512.00567v2`, or versionless, like
    `1512.00567`. If versionless, the arXiv API will return metadata for the
    latest version. Legacy IDs, such as `cond-mat/0703470v2`, are also
    supported.

    If arXiv has an associated DOI for the record, an info-level message is
    logged to alert the user that an alternative version of record exists.

    Raises ValueError when the response does not contain exactly one entry,
    or when the entry's id is not an arXiv abstract URL (i.e. the API did
    not return a usable record).

    References:
    - https://arxiv.org/help/api/index
    - http://citeproc-js.readthedocs.io/en/latest/csl-json/markup.html
    - https://github.com/citation-style-language/schema/blob/master/csl-data.json
    """
    xml_tree = query_arxiv_api(
        url="https://export.arxiv.org/api/query",
        params={"id_list": arxiv_id, "max_results": 1},
    )

    # XML namespace prefixes
    prefix = "{http://www.w3.org/2005/Atom}"
    alt_prefix = "{http://arxiv.org/schemas/atom}"

    # Parse XML: exactly one entry is expected
    entries = xml_tree.findall(prefix + "entry")
    if len(entries) != 1:
        raise ValueError(
            f"arXiv API returned {len(entries)} entries for {arxiv_id!r}; expected 1"
        )
    (entry,) = entries

    # Create dictionary for CSL Item
    csl_item = CSL_Item_arXiv()

    # Extract versioned arXiv ID from the entry's abstract URL
    url = entry.findtext(prefix + "id", default="")
    match = re.search(r"arxiv\.org/abs/(.+)", url)
    if match is None:
        raise ValueError(
            f"arXiv API entry for {arxiv_id!r} has an unexpected id: {url!r}"
        )
    csl_item.set_identifier_fields(match.group(1))

    # Extract CSL title field, collapsing the whitespace used to wrap long
    # titles (same normalization as get_arxiv_csl_item_oai)
    title = entry.findtext(prefix + "title")
    if title:
        csl_item["title"] = " ".join(title.split())

    # Extract CSL date field
    published = entry.findtext(prefix + "published")
    csl_item.set_date(published, variable="issued")

    # Extract authors: this endpoint only provides the full name
    csl_item["author"] = [
        {"literal": elem.findtext(prefix + "name")}
        for elem in entry.findall(prefix + "author")
    ]
    csl_item._set_invariant_fields()

    # Extract abstract; a missing summary element is treated as empty
    abstract = (entry.findtext(prefix + "summary") or "").strip()
    if abstract:
        # remove newlines that were added to wrap abstract
        csl_item["abstract"] = remove_newlines(abstract)

    # Check if the article has been published with a DOI
    doi = entry.findtext(f"{alt_prefix}doi")
    if doi:
        csl_item["DOI"] = doi
    journal_ref = entry.findtext(alt_prefix + "journal_ref")
    csl_item.log_journal_doi(arxiv_id, journal_ref)
    return csl_item
def get_arxiv_csl_item_oai(arxiv_id):
    """
    Build a CSL Item for a versionless arXiv identifier from arXiv's
    OAI_PMH v2.0 API <https://arxiv.org/help/oa>.

    Versioned `arxiv_id` values are not supported by this endpoint.
    """
    # XML namespace prefixes
    ns_oai = "{http://www.openarchives.org/OAI/2.0/}"
    ns_arxiv = "{http://arxiv.org/OAI/arXiv/}"
    record_path = f"{ns_oai}GetRecord/{ns_oai}record"

    request_params = {
        "verb": "GetRecord",
        "metadataPrefix": "arXiv",
        "identifier": f"oai:arXiv.org:{arxiv_id}",
    }
    root = query_arxiv_api(url="https://export.arxiv.org/oai2", params=request_params)
    item = CSL_Item_arXiv()

    # Each lookup must yield exactly one element; otherwise the tuple
    # unpacking raises ValueError.
    (header,) = root.findall(f"{record_path}/{ns_oai}header")
    (metadata,) = root.findall(f"{record_path}/{ns_oai}metadata")
    (arxiv_node,) = metadata.findall(f"{ns_arxiv}arXiv")

    def field(name):
        # Text of a direct child of the arXiv metadata element, or None.
        return arxiv_node.findtext(f"{ns_arxiv}{name}")

    # Identifier fields come from the response, which may differ from the query
    returned_id = field("id")
    if returned_id != arxiv_id:
        logging.warning(
            "arXiv oai2 query returned a different arxiv_id:"
            f" {arxiv_id} became {returned_id}"
        )
    item.set_identifier_fields(returned_id)

    # Title (whitespace collapsed) and date (taken from the record header)
    raw_title = field("title")
    if raw_title:
        item["title"] = " ".join(raw_title.split())
    item.set_date(header.findtext(f"{ns_oai}datestamp"), "issued")

    # Authors: keep only the name parts that are present and non-empty
    item["author"] = [
        {
            key: value
            for key, value in (
                ("given", node.findtext(f"{ns_arxiv}forenames")),
                ("family", node.findtext(f"{ns_arxiv}keyname")),
            )
            if value
        }
        for node in arxiv_node.findall(f"{ns_arxiv}authors/{ns_arxiv}author")
    ]
    item._set_invariant_fields()

    abstract_text = field("abstract")
    if abstract_text:
        item["abstract"] = remove_newlines(abstract_text)

    license_text = field("license")
    if license_text:
        item.note_append_dict({"license": license_text})

    doi = field("doi")
    if doi:
        item["DOI"] = doi
    item.log_journal_doi(arxiv_id, field("journal-ref"))
    return item
def remove_newlines(text):
    """
    Replace with a space every newline that is not followed by whitespace.

    Newlines followed by whitespace (for example the first newline of a
    blank-line break) are left in place; a newline at the very end of
    `text` is replaced.
    """
    wrap_newline = re.compile(r"\n(?!\s)")
    return wrap_newline.sub(" ", text)
def get_arxiv_csl_item_zotero(arxiv_id):
    """
    Return CSL JSON Data for an arXiv ID obtained through Zotero's
    translation-server.
    """
    # Imported inside the function, as in the original implementation.
    from manubot.cite.zotero import get_csl_item as zotero_get_csl_item

    citekey = f"arxiv:{arxiv_id}"
    return zotero_get_csl_item(citekey)
Functions
get_arxiv_csl_item
def get_arxiv_csl_item(
arxiv_id: str
)
Return csl_item item for an arXiv identifier.
Chooses which arXiv API to use based on whether arxiv_id is versioned, since only one endpoint supports versioning.
View Source
def get_arxiv_csl_item(arxiv_id: str):
    """
    Return the CSL Item for an arXiv identifier.

    Versioned identifiers are routed to the export API, because it is the
    only endpoint that supports versions; versionless identifiers are routed
    to the OAI endpoint.
    """
    version = split_arxiv_id_version(arxiv_id)[1]
    fetch = get_arxiv_csl_item_export_api if version else get_arxiv_csl_item_oai
    return fetch(arxiv_id)
get_arxiv_csl_item_export_api
def get_arxiv_csl_item_export_api(
arxiv_id
)
Return csl_item item for an arXiv record.
arxiv_id can be versioned, like 1512.00567v2, or versionless, like 1512.00567. If versionless, the arXiv API will return metadata for the latest version. Legacy IDs, such as cond-mat/0703470v2, are also supported.
If arXiv has an associated DOI for the record, an info-level message is logged to alert the user that an alternative version of record exists.
References: - https://arxiv.org/help/api/index - http://citeproc-js.readthedocs.io/en/latest/csl-json/markup.html - https://github.com/citation-style-language/schema/blob/master/csl-data.json
View Source
def get_arxiv_csl_item_export_api(arxiv_id):
    """
    Return csl_item item for an arXiv record.

    arxiv_id can be versioned, like `1512.00567v2`, or versionless, like
    `1512.00567`. If versionless, the arXiv API will return metadata for the
    latest version. Legacy IDs, such as `cond-mat/0703470v2`, are also
    supported.

    If arXiv has an associated DOI for the record, an info-level message is
    logged to alert the user that an alternative version of record exists.

    Raises ValueError when the response does not contain exactly one entry,
    or when the entry's id is not an arXiv abstract URL (i.e. the API did
    not return a usable record).

    References:
    - https://arxiv.org/help/api/index
    - http://citeproc-js.readthedocs.io/en/latest/csl-json/markup.html
    - https://github.com/citation-style-language/schema/blob/master/csl-data.json
    """
    xml_tree = query_arxiv_api(
        url="https://export.arxiv.org/api/query",
        params={"id_list": arxiv_id, "max_results": 1},
    )

    # XML namespace prefixes
    prefix = "{http://www.w3.org/2005/Atom}"
    alt_prefix = "{http://arxiv.org/schemas/atom}"

    # Parse XML: exactly one entry is expected
    entries = xml_tree.findall(prefix + "entry")
    if len(entries) != 1:
        raise ValueError(
            f"arXiv API returned {len(entries)} entries for {arxiv_id!r}; expected 1"
        )
    (entry,) = entries

    # Create dictionary for CSL Item
    csl_item = CSL_Item_arXiv()

    # Extract versioned arXiv ID from the entry's abstract URL
    url = entry.findtext(prefix + "id", default="")
    match = re.search(r"arxiv\.org/abs/(.+)", url)
    if match is None:
        raise ValueError(
            f"arXiv API entry for {arxiv_id!r} has an unexpected id: {url!r}"
        )
    csl_item.set_identifier_fields(match.group(1))

    # Extract CSL title field, collapsing the whitespace used to wrap long
    # titles (same normalization as get_arxiv_csl_item_oai)
    title = entry.findtext(prefix + "title")
    if title:
        csl_item["title"] = " ".join(title.split())

    # Extract CSL date field
    published = entry.findtext(prefix + "published")
    csl_item.set_date(published, variable="issued")

    # Extract authors: this endpoint only provides the full name
    csl_item["author"] = [
        {"literal": elem.findtext(prefix + "name")}
        for elem in entry.findall(prefix + "author")
    ]
    csl_item._set_invariant_fields()

    # Extract abstract; a missing summary element is treated as empty
    abstract = (entry.findtext(prefix + "summary") or "").strip()
    if abstract:
        # remove newlines that were added to wrap abstract
        csl_item["abstract"] = remove_newlines(abstract)

    # Check if the article has been published with a DOI
    doi = entry.findtext(f"{alt_prefix}doi")
    if doi:
        csl_item["DOI"] = doi
    journal_ref = entry.findtext(alt_prefix + "journal_ref")
    csl_item.log_journal_doi(arxiv_id, journal_ref)
    return csl_item
get_arxiv_csl_item_oai
def get_arxiv_csl_item_oai(
arxiv_id
)
Generate a CSL Item for an unversioned arXiv identifier
using arXiv's OAI_PMH v2.0 API https://arxiv.org/help/oa.
This endpoint does not support versioned arxiv_id.
View Source
def get_arxiv_csl_item_oai(arxiv_id):
    """
    Build a CSL Item for a versionless arXiv identifier from arXiv's
    OAI_PMH v2.0 API <https://arxiv.org/help/oa>.

    Versioned `arxiv_id` values are not supported by this endpoint.
    """
    # XML namespace prefixes
    ns_oai = "{http://www.openarchives.org/OAI/2.0/}"
    ns_arxiv = "{http://arxiv.org/OAI/arXiv/}"
    record_path = f"{ns_oai}GetRecord/{ns_oai}record"

    request_params = {
        "verb": "GetRecord",
        "metadataPrefix": "arXiv",
        "identifier": f"oai:arXiv.org:{arxiv_id}",
    }
    root = query_arxiv_api(url="https://export.arxiv.org/oai2", params=request_params)
    item = CSL_Item_arXiv()

    # Each lookup must yield exactly one element; otherwise the tuple
    # unpacking raises ValueError.
    (header,) = root.findall(f"{record_path}/{ns_oai}header")
    (metadata,) = root.findall(f"{record_path}/{ns_oai}metadata")
    (arxiv_node,) = metadata.findall(f"{ns_arxiv}arXiv")

    def field(name):
        # Text of a direct child of the arXiv metadata element, or None.
        return arxiv_node.findtext(f"{ns_arxiv}{name}")

    # Identifier fields come from the response, which may differ from the query
    returned_id = field("id")
    if returned_id != arxiv_id:
        logging.warning(
            "arXiv oai2 query returned a different arxiv_id:"
            f" {arxiv_id} became {returned_id}"
        )
    item.set_identifier_fields(returned_id)

    # Title (whitespace collapsed) and date (taken from the record header)
    raw_title = field("title")
    if raw_title:
        item["title"] = " ".join(raw_title.split())
    item.set_date(header.findtext(f"{ns_oai}datestamp"), "issued")

    # Authors: keep only the name parts that are present and non-empty
    item["author"] = [
        {
            key: value
            for key, value in (
                ("given", node.findtext(f"{ns_arxiv}forenames")),
                ("family", node.findtext(f"{ns_arxiv}keyname")),
            )
            if value
        }
        for node in arxiv_node.findall(f"{ns_arxiv}authors/{ns_arxiv}author")
    ]
    item._set_invariant_fields()

    abstract_text = field("abstract")
    if abstract_text:
        item["abstract"] = remove_newlines(abstract_text)

    license_text = field("license")
    if license_text:
        item.note_append_dict({"license": license_text})

    doi = field("doi")
    if doi:
        item["DOI"] = doi
    item.log_journal_doi(arxiv_id, field("journal-ref"))
    return item
get_arxiv_csl_item_zotero
def get_arxiv_csl_item_zotero(
arxiv_id
)
Generate CSL JSON Data for an arXiv ID using Zotero's translation-server.
View Source
def get_arxiv_csl_item_zotero(arxiv_id):
    """
    Return CSL JSON Data for an arXiv ID obtained through Zotero's
    translation-server.
    """
    # Imported inside the function, as in the original implementation.
    from manubot.cite.zotero import get_csl_item as zotero_get_csl_item

    citekey = f"arxiv:{arxiv_id}"
    return zotero_get_csl_item(citekey)
query_arxiv_api
def query_arxiv_api(
url,
params
)
View Source
def query_arxiv_api(url, params, timeout=60):
    """
    Send a GET request to an arXiv endpoint and return the parsed XML root.

    url: endpoint URL.
    params: query parameters forwarded to `requests.get`.
    timeout: seconds to wait for the server before giving up. Without a
        timeout a stalled connection would block the caller indefinitely.

    Returns the root `xml.etree.ElementTree.Element` of the response body.
    Raises `requests.HTTPError` for an unsuccessful status code (via
    `raise_for_status`) and `xml.etree.ElementTree.ParseError` when the body
    is not well-formed XML.
    """
    headers = {"User-Agent": get_manubot_user_agent()}
    response = requests.get(url, params, headers=headers, timeout=timeout)
    response.raise_for_status()
    # Parse the raw bytes so the XML parser applies the encoding declared by
    # the document itself. `response.text` instead decodes with the charset
    # requests infers from the HTTP headers, which may not match the payload.
    return xml.etree.ElementTree.fromstring(response.content)
remove_newlines
def remove_newlines(
text
)
View Source
def remove_newlines(text):
    """
    Replace with a space every newline that is not followed by whitespace.

    Newlines followed by whitespace (for example the first newline of a
    blank-line break) are left in place; a newline at the very end of
    `text` is replaced.
    """
    wrap_newline = re.compile(r"\n(?!\s)")
    return wrap_newline.sub(" ", text)
split_arxiv_id_version
def split_arxiv_id_version(
arxiv_id: str
)
Return (versionless_id, version) tuple.
Version refers to the version suffix like 'v2' or None.
View Source
def split_arxiv_id_version(arxiv_id: str):
    """
    Return (versionless_id, version) tuple.

    `version` is the version suffix, like 'v2', or None when `arxiv_id`
    carries no version. Matching is anchored only at the start of
    `arxiv_id`, so any text following a valid identifier is ignored.

    Raises ValueError when `arxiv_id` does not start with an identifier
    matched by `Handler_arXiv.accession_pattern`.
    """
    match = Handler_arXiv.accession_pattern.match(arxiv_id)
    if match is None:
        # Without this guard the failure surfaced as an opaque
        # AttributeError raised on None.
        raise ValueError(f"{arxiv_id!r} is not a valid arXiv identifier")
    return match.group("versionless_id"), match.group("version")
Classes
CSL_Item_arXiv
class CSL_Item_arXiv(
dictionary=None,
**kwargs
)
CSL_Item represents bibliographic information for a single citeable work.
On a technical side CSL_Item is a Python dictionary with extra methods that help cleaning and manipulating it.
These methods relate to:
- adding an id key and value for CSL item
- correcting bibliographic information and its structure
- adding and reading a custom note to CSL item
More information on CSL JSON (a list of CSL_Items) is available at: - https://citeproc-js.readthedocs.io/en/latest/csl-json/markup.html - http://docs.citationstyles.org/en/1.0.1/specification.html#standard-variables - https://github.com/citation-style-language/schema/blob/master/csl-data.json
View Source
class CSL_Item_arXiv(CSL_Item):
    """CSL_Item subclass with helpers for filling in arXiv-specific fields."""

    def _set_invariant_fields(self):
        """Set the fields that are identical for every arXiv record; return self."""
        invariant_fields = (
            # journal/publisher are always arXiv
            ("container-title", "arXiv"),
            ("publisher", "arXiv"),
            # CSL type used to report a preprint
            ("type", "report"),
        )
        for field, value in invariant_fields:
            self[field] = value
        return self

    def log_journal_doi(self, arxiv_id, journal_ref=None):
        """
        When this item has a DOI, log (at info level) where the arXiv
        article was published, followed by `journal_ref` if provided.
        Does nothing when no DOI is set.
        """
        if "DOI" in self:
            fragments = [
                f"arXiv article {arxiv_id} published at https://doi.org/{self['DOI']}"
            ]
            if journal_ref:
                fragments.append(f" — {journal_ref}")
            logging.info("".join(fragments))

    def set_identifier_fields(self, arxiv_id):
        """
        Derive the id, URL, and number fields from `arxiv_id`, and the
        version field when `arxiv_id` carries a version suffix.
        """
        self.set_id(f"arxiv:{arxiv_id}")
        self["URL"] = f"https://arxiv.org/abs/{arxiv_id}"
        self["number"] = arxiv_id
        version = split_arxiv_id_version(arxiv_id)[1]
        if version:
            self["version"] = version
Ancestors (in MRO)
- manubot.cite.csl_item.CSL_Item
- builtins.dict
Class variables
type_mapping
Instance variables
note
Return the value of the "note" field as a string.
If "note" key is not set, return empty string.
note_dict
Return a dictionary with key-value pairs encoded by this CSL Item's note.
Extracts both forms (line-entry and braced-entry) of key-value pairs from the CSL JSON "cheater syntax" https://github.com/Juris-M/citeproc-js-docs/blob/93d7991d42b4a96b74b7281f38e168e365847e40/csl-json/markup.rst#cheater-syntax-for-odd-fields
Assigning to this dict will not update self["note"].
Methods
clean
def clean(
self,
prune: bool = True
) -> 'CSL_Item'
Sanitize and touch-up a potentially dirty CSL_Item.
The following steps are performed: - update incorrect values for "type" field when a correct variant is known - remove fields that violate the JSON Schema (if prune=True) - set default value for "type" if missing, since CSL JSON requires type - validate against the CSL JSON schema (if prune=True) to ensure output CSL_Item is clean
View Source
def clean(self, prune: bool = True) -> "CSL_Item":
    """
    Sanitize and touch-up a potentially dirty CSL_Item.

    The following steps are performed, in order:
    - update incorrect values for "type" field when a correct variant is known
    - remove fields that violate the JSON Schema (if prune=True)
    - set default value for "type" if missing, since CSL JSON requires type
    - validate against the CSL JSON schema (if prune=True) to ensure output
      CSL_Item is clean

    prune: whether to run the schema-pruning and schema-validation steps.
    Returns self, so calls can be chained.
    """
    # The two fragments are concatenated, so the first must end with a
    # space; previously the message read "withCSL pruning" or
    # "withoutCSL pruning".
    logging.debug(
        f"Starting CSL_Item.clean with{'' if prune else 'out'} "
        f"CSL pruning for id: {self.get('id', 'id not specified')}"
    )
    self.correct_invalid_type()
    if prune:
        self.prune_against_schema()
    self.set_default_type()
    if prune:
        self.validate_against_schema()
    return self
clear
def clear(
...
)
D.clear() -> None. Remove all items from D.
copy
def copy(
...
)
D.copy() -> a shallow copy of D
correct_invalid_type
def correct_invalid_type(
self
) -> 'CSL_Item'
Correct invalid CSL item type.
Does nothing if type is not present.
For detail see https://github.com/CrossRef/rest-api-doc/issues/187
View Source
def correct_invalid_type(self) -> "CSL_Item":
    """
    Replace a known-invalid CSL item type with its correct variant.

    A type listed in `type_mapping` is swapped for the mapped value; any
    other type is kept as is. Nothing happens when `type` is absent.
    For detail see https://github.com/CrossRef/rest-api-doc/issues/187

    Returns self.
    """
    try:
        current = self["type"]
    except KeyError:
        return self
    # Fall back to the current value when no replacement is registered.
    self["type"] = self.type_mapping.get(current, current)
    return self
fromkeys
def fromkeys(
iterable,
value=None,
/
)
Create a new dictionary with keys from iterable and values set to value.
get
def get(
self,
key,
default=None,
/
)
Return the value for key if key is in the dictionary, else default.
get_date
def get_date(
self,
variable: str = 'issued',
fill: bool = False
) -> Optional[str]
Return a CSL date-variable as ISO formatted string:
('YYYY', 'YYYY-MM', 'YYYY-MM-DD', or None).
variable: which CSL JSON date variable to retrieve fill: if True, set missing months to January and missing days to the first day of the month.
View Source
def get_date(self, variable: str = "issued", fill: bool = False) -> Optional[str]:
    """
    Return a CSL date-variable as ISO formatted string:
    ('YYYY', 'YYYY-MM', 'YYYY-MM-DD', or None).

    variable: which CSL JSON date variable to retrieve
    fill: if True, set missing months to January
        and missing days to the first day of the month.

    None is returned when the variable, its "date-parts" key, or the first
    date-parts entry is missing.
    """
    try:
        first_date_parts = self[variable]["date-parts"][0]
    except (IndexError, KeyError):
        return None
    else:
        return date_parts_to_string(first_date_parts, fill=fill)
infer_id
def infer_id(
self
) -> 'CSL_Item'
Detect and set a non-null/empty value for "id" or else raise a ValueError.
View Source
def infer_id(self) -> "CSL_Item":
    """
    Detect and set a non-null/empty value for "id" or else raise a ValueError.

    Sources are tried in this order: the "standard_citation" field (which is
    removed from the item when used), the "standard_id" note field, and the
    existing "id" field.
    """
    if self.get("standard_citation"):
        # "standard_citation" field is set with a non-null/empty value
        detected_id = self.pop("standard_citation")
    elif self.note_dict.get("standard_id"):
        # "standard_id" note field is set with a non-null/empty value
        detected_id = self.note_dict["standard_id"]
    elif self.get("id"):
        # "id" field exists and is set with a non-null/empty value
        detected_id = self["id"]
    else:
        raise ValueError(
            "infer_id could not detect a field with a citation / standard_citation. "
            'Consider setting the CSL Item "id" field.'
        )
    return self.set_id(detected_id)
items
def items(
...
)
D.items() -> a set-like object providing a view on D's items
keys
def keys(
...
)
D.keys() -> a set-like object providing a view on D's keys
log_journal_doi
def log_journal_doi(
self,
arxiv_id,
journal_ref=None
)
View Source
def log_journal_doi(self, arxiv_id, journal_ref=None):
    """
    When this item has a DOI, log (at info level) where the arXiv article
    was published, followed by `journal_ref` if provided.
    Does nothing when no DOI is set.
    """
    if "DOI" in self:
        fragments = [
            f"arXiv article {arxiv_id} published at https://doi.org/{self['DOI']}"
        ]
        if journal_ref:
            fragments.append(f" — {journal_ref}")
        logging.info("".join(fragments))
note_append_dict
def note_append_dict(
self,
dictionary: dict
) -> None
Append key-value pairs specified by dictionary to the note field of a CSL Item.
Uses the CSL JSON "cheater syntax" to encode additional values not defined by the CSL JSON schema.
View Source
def note_append_dict(self, dictionary: dict) -> None:
    """
    Add each key-value pair of `dictionary` to the note field as a
    "key: value" line.

    This relies on the [CSL JSON "cheater syntax"](https://github.com/Juris-M/citeproc-js-docs/blob/93d7991d42b4a96b74b7281f38e168e365847e40/csl-json/markup.rst#cheater-syntax-for-odd-fields)
    for values the CSL JSON schema does not define. Pairs whose key is not
    all-uppercase letters or all lowercase letters/hyphens/underscores, or
    whose value contains a newline, are skipped with a warning.
    """
    variable_name = re.compile(r"[A-Z]+|[-_a-z]+")
    for key, value in dictionary.items():
        if variable_name.fullmatch(key) is None:
            logging.warning(
                f"note_append_dict: skipping adding {key!r} because "
                f"it does not conform to the variable_name syntax as per https://git.io/fjTzW."
            )
        elif "\n" in value:
            logging.warning(
                f"note_append_dict: skipping adding {key!r} because "
                f"the value contains a newline: {value!r}"
            )
        else:
            self.note_append_text(f"{key}: {value}")
note_append_text
def note_append_text(
self,
text: str
) -> None
Append text to the note field (as a new line) of a CSL Item.
If a line already exists equal to text, do nothing.
View Source
def note_append_text(self, text: str) -> None:
    """
    Add `text` as a new line at the end of the note field of a CSL Item.

    Nothing happens when `text` is empty or when the note already contains
    a line equal to `text`.
    """
    if not text:
        return
    current_note = self.note
    # do not accumulate duplicate lines of text
    # https://github.com/manubot/manubot/issues/258
    duplicate = re.search(f"^{re.escape(text)}$", current_note, flags=re.MULTILINE)
    if duplicate:
        return
    # Start a new line only when the existing note does not already end one.
    separator = "\n" if current_note and not current_note.endswith("\n") else ""
    self.note = f"{current_note}{separator}{text}"
pop
def pop(
...
)
D.pop(k[,d]) -> v, remove specified key and return the corresponding value.
If the key is not found, return the default if given; otherwise, raise a KeyError.
popitem
def popitem(
self,
/
)
Remove and return a (key, value) pair as a 2-tuple.
Pairs are returned in LIFO (last-in, first-out) order. Raises KeyError if the dict is empty.
prune_against_schema
def prune_against_schema(
self
) -> 'CSL_Item'
Remove fields that violate the CSL Item JSON Schema.
View Source
def prune_against_schema(self) -> "CSL_Item":
    """
    Strip, in place, the fields that violate the CSL Item JSON Schema.

    Returns self.
    """
    from .citeproc import remove_jsonschema_errors

    pruned_items = remove_jsonschema_errors([self], in_place=True)
    # A single item went in, so a single item must come back, and because
    # pruning is in place it must be this very object.
    (pruned,) = pruned_items
    assert pruned is self
    return self
set_date
def set_date(
self,
date: Union[NoneType, str, datetime.date, datetime.datetime],
variable: str = 'issued'
) -> 'CSL_Item'
date: date either as a string (in the form YYYY, YYYY-MM, or YYYY-MM-DD)
or as a Python date object (datetime.date or datetime.datetime). variable: which variable to assign the date to.
View Source
def set_date(
    self,
    date: Union[None, str, datetime.date, datetime.datetime],
    variable: str = "issued",
) -> "CSL_Item":
    """
    Assign a date to a CSL date variable.

    date: date either as a string (in the form YYYY, YYYY-MM, or YYYY-MM-DD)
        or as a Python date object (datetime.date or datetime.datetime).
    variable: which variable to assign the date to.

    The variable is left untouched when `date_to_date_parts` yields nothing
    for `date`. Returns self.
    """
    parts = date_to_date_parts(date)
    if not parts:
        return self
    self[variable] = {"date-parts": [parts]}
    return self
set_default_type
def set_default_type(
self
) -> 'CSL_Item'
Set type to 'entry', if type not specified.
View Source
def set_default_type(self) -> "CSL_Item":
    """
    Ensure the item has a "type" key, using 'entry' when none is present.

    An existing type value is kept unchanged. Returns self.
    """
    resolved_type = self.get("type", "entry")
    self["type"] = resolved_type
    return self
set_id
def set_id(
self,
id_
) -> 'CSL_Item'
View Source
def set_id(self, id_) -> "CSL_Item":
    """Store `id_` under the "id" key and return self so calls can be chained."""
    self["id"] = id_
    return self
set_identifier_fields
def set_identifier_fields(
self,
arxiv_id
)
View Source
def set_identifier_fields(self, arxiv_id):
    """
    Derive the id, URL, and number fields from `arxiv_id`, and the version
    field when `arxiv_id` carries a version suffix.
    """
    self.set_id(f"arxiv:{arxiv_id}")
    self["URL"] = f"https://arxiv.org/abs/{arxiv_id}"
    self["number"] = arxiv_id
    version = split_arxiv_id_version(arxiv_id)[1]
    if version:
        self["version"] = version
setdefault
def setdefault(
self,
key,
default=None,
/
)
Insert key with a value of default if key is not in the dictionary.
Return the value for key if key is in the dictionary, else default.
standardize_id
def standardize_id(
self
) -> 'CSL_Item'
Extract the standard_id (standard citation key) for a csl_item and modify the csl_item in-place to set its "id" field.
The standard_id is extracted from a "standard_citation" field, the "note" field, or the "id" field. The extracted citation is checked for validity and standardized, after which it is the final "standard_id".
Regarding csl_item modification, the csl_item "id" field is set to the standard_citation and the note field is created or updated with key-value pairs for standard_id and original_id.
Note that the Manubot software generally refers to the "id" of a CSL Item as a citekey. However, in this context, we use "id" rather than "citekey" for consistency with CSL's "id" field.
View Source
def standardize_id(self) -> "CSL_Item":
    """
    Determine the standard_id (standard citation key) of this csl_item and
    set it as the item's "id", modifying the item in place.

    The starting id is inferred from a "standard_citation" field, the "note"
    field, or the "id" field. It is then passed through CiteKey to obtain
    the final "standard_id".

    The note field is created or updated with key-value pairs recording the
    standard_id and, where they differ from it, the original_id and
    original_standard_id. Values already recorded in the note are not added
    again.

    Note that the Manubot software generally refers to the "id" of a CSL Item
    as a citekey. However, in this context, we use "id" rather than "citekey"
    for consistency with CSL's "id" field.
    """
    original_id = self.get("id")
    self.infer_id()
    original_standard_id = self["id"]
    standard_id = CiteKey(original_standard_id).standard_id

    recorded = self.note_dict
    additions = {}
    if (
        original_id
        and original_id != standard_id
        and original_id != recorded.get("original_id")
    ):
        additions["original_id"] = original_id
    if (
        original_standard_id
        and original_standard_id != standard_id
        and original_standard_id != recorded.get("original_standard_id")
    ):
        additions["original_standard_id"] = original_standard_id
    if standard_id != recorded.get("standard_id"):
        additions["standard_id"] = standard_id

    self.note_append_dict(dictionary=additions)
    self.set_id(standard_id)
    return self
update
def update(
...
)
D.update([E, ]**F) -> None. Update D from dict/iterable E and F.
If E is present and has a .keys() method, then does: for k in E: D[k] = E[k] If E is present and lacks a .keys() method, then does: for k, v in E: D[k] = v In either case, this is followed by: for k in F: D[k] = F[k]
validate_against_schema
def validate_against_schema(
self
) -> 'CSL_Item'
Confirm that the CSL_Item validates. If not, raises a
jsonschema.exceptions.ValidationError.
View Source
def validate_against_schema(self) -> "CSL_Item":
    """
    Check this CSL_Item against the CSL JSON schema.

    Returns self when validation succeeds; otherwise a
    jsonschema.exceptions.ValidationError is raised.
    """
    from .citeproc import get_jsonschema_csl_validator

    # The validator is given a one-element list containing this item.
    get_jsonschema_csl_validator().validate([self])
    return self
values
def values(
...
)
D.values() -> an object providing a view on D's values
Handler_arXiv
class Handler_arXiv(
prefix_lower: str
)
A Handler is a class that provides support for a certain type of citekey.
For example, a Handler subclass could provide support for DOI citekeys. Subclasses enable custom logic for different citekey prefixes, including how to standardize the citekey and how to retrieve CSL Item metadata.
View Source
class Handler_arXiv(Handler):
    """
    Citekey handler for the `arxiv` prefix.

    `accession_pattern` accepts two identifier shapes, each optionally
    followed by a version suffix such as `v2`:
    - four digits, a dot, then four or five digits (like `1512.00567`)
    - the legacy archive form ending in a slash and seven digits
      (like `cond-mat/0703470`)
    """

    standard_prefix = "arxiv"
    prefixes = ["arxiv"]
    accession_pattern = re.compile(
        r"(?P<versionless_id>[0-9]{4}\.[0-9]{4,5}|[a-z\-]+(\.[A-Z]{2})?/[0-9]{7})(?P<version>v[0-9]+)?"
    )

    def inspect(self, citekey):
        """
        Return a message describing the problem when `citekey.accession` is
        not fully matched by the pattern from `self._get_pattern()`.
        Return None when the accession is acceptable.
        """
        # https://arxiv.org/help/arxiv_identifier
        if self._get_pattern().fullmatch(citekey.accession):
            return None
        return "arXiv identifiers must conform to syntax described at https://arxiv.org/help/arxiv_identifier."

    def get_csl_item(self, citekey):
        """Return the CSL Item for the citekey's standardized accession."""
        accession = citekey.standard_accession
        return get_arxiv_csl_item(accession)
Ancestors (in MRO)
- manubot.cite.handlers.Handler
Class variables
accession_pattern
prefixes
standard_prefix
Methods
get_csl_item
def get_csl_item(
self,
citekey
)
Return a CSL_Item with bibliographic details for citekey.
View Source
def get_csl_item(self, citekey):
    """Return the CSL Item for the citekey's standardized accession."""
    accession = citekey.standard_accession
    return get_arxiv_csl_item(accession)
inspect
def inspect(
self,
citekey
)
Check citekeys adhere to expected formats. If an issue is detected a
string describing the issue is returned. Otherwise returns None.
View Source
def inspect(self, citekey):
    """
    Return a message describing the problem when `citekey.accession` is not
    fully matched by the pattern from `self._get_pattern()`.
    Return None when the accession is acceptable.
    """
    # https://arxiv.org/help/arxiv_identifier
    if self._get_pattern().fullmatch(citekey.accession):
        return None
    return "arXiv identifiers must conform to syntax described at https://arxiv.org/help/arxiv_identifier."
standardize_prefix_accession
def standardize_prefix_accession(
self,
accession: str
) -> Tuple[str, str]
Return (prefix, accession) in standardized form.
This method defaults to returning self.standard_prefix (or self.prefix_lower if standard_prefix is not defined).
Subclasses can override this method with more specific standardization logic.
View Source
def standardize_prefix_accession(self, accession: str) -> Tuple[str, str]:
    """
    Return (prefix, accession) in standardized form.

    The prefix is `self.standard_prefix` when that attribute exists and
    `self.prefix_lower` otherwise; the accession is returned unchanged.
    Subclasses can override this method with more specific standardization
    logic.
    """
    # Read eagerly, as the original did when building the getattr default.
    fallback_prefix = self.prefix_lower
    prefix = getattr(self, "standard_prefix", fallback_prefix)
    return prefix, accession