Module manubot.cite.url
View Source
import json
import logging
import re
from typing import Any, Dict
from .handlers import Handler
# Type alias for a CSL (Citation Style Language) JSON item: a plain dict
# mapping CSL field names to arbitrary values.
CSLItem = Dict[str, Any]
class Handler_URL(Handler):
    """
    Citekey handler for URL citations, accepting the ``url``, ``http``,
    and ``https`` prefixes and standardizing them all to ``url``.
    """

    standard_prefix = "url"

    prefixes = [
        "url",
        "http",
        "https",
    ]

    def standardize_prefix_accession(self, accession):
        # For http/https citekeys the scheme is the prefix itself, so fold
        # it back into the accession: the standardized form is url:<full-url>.
        prefix = self.prefix_lower
        if prefix != "url":
            accession = f"{prefix}:{accession}"
        return self.standard_prefix, accession

    def get_csl_item(self, citekey):
        # Delegate metadata retrieval to the module-level strategy chain.
        return get_url_csl_item(citekey.standard_accession)
def get_url_csl_item(url: str) -> CSLItem:
    """
    Generate a CSL Item for a URL by trying a sequence of strategies.

    Each retriever in the module-level ``url_retrievers`` list is attempted
    in order. The result of the first retriever that does not raise is
    returned; failures are logged and the next retriever is tried. Raises
    ``Exception`` if every retriever fails.
    """
    for method in url_retrievers:
        try:
            csl_item = method(url)
        except Exception as error:
            logging.warning(
                f"Error in {method.__name__} for {url} "
                f"due to a {error.__class__.__name__}:\n{error}"
            )
            # Full traceback at INFO level to keep WARNING output compact.
            logging.info(error, exc_info=True)
        else:
            return csl_item
    raise Exception(f"all get_url_csl_item methods failed for {url}")
def get_url_csl_item_zotero(url: str) -> CSLItem:
    """
    Use Zotero's translation-server to generate a CSL Item for the specified URL.
    """
    from manubot.cite.zotero import export_as_csl, web_query

    csl_data = export_as_csl(web_query(url))
    # A single URL query is expected to yield exactly one CSL item.
    (csl_item,) = csl_data
    # Some Zotero translators don't set URL, so backfill it.
    # https://github.com/manubot/manubot/issues/244
    if not csl_item.get("URL"):
        csl_item["URL"] = url
    return csl_item
def get_url_csl_item_greycite(url: str) -> CSLItem:
    """
    Uses Greycite which has experienced uptime problems in the past.
    API calls seem to take at least 15 seconds. Browser requests are much
    faster. Setting header did not have an effect. Consider mimicking browser
    using selenium.

    More information on Greycite at:
    http://greycite.knowledgeblog.org/
    http://knowledgeblog.org/greycite
    https://arxiv.org/abs/1304.7151
    https://git.io/v9N2C
    """
    import requests

    from manubot.util import get_manubot_user_agent

    response = requests.get(
        "http://greycite.knowledgeblog.org/json",
        params={"uri": url},
        headers={
            # https://github.com/kennethreitz/requests/issues/4023
            "Connection": "close",
            "User-Agent": get_manubot_user_agent(),
        },
    )
    response.raise_for_status()
    # Greycite sometimes appends an error marker to otherwise valid JSON,
    # e.g. "<p>*** Date set from uri<p>" or "<p>*** fetch error : 404<p>".
    # Strip such markers before parsing.
    cleaned = re.sub(r"<p>\*\*\*.*<p>", "", response.text)
    csl_item = json.loads(cleaned)
    csl_item["type"] = "webpage"
    return csl_item
def get_url_csl_item_manual(url: str) -> CSLItem:
    """
    Manually create csl_item for a URL, without consulting any web service.
    Always succeeds, so it serves as the final fallback strategy.
    """
    csl_item = {"type": "webpage"}
    csl_item["URL"] = url
    return csl_item
# Retrieval strategies tried in order by get_url_csl_item: Zotero first,
# Greycite as fallback, then the manual stub, which always succeeds —
# so get_url_csl_item should not normally exhaust the list.
url_retrievers = [
    get_url_csl_item_zotero,
    get_url_csl_item_greycite,
    get_url_csl_item_manual,
]
Variables
CSLItem
url_retrievers
Functions
get_url_csl_item
def get_url_csl_item(
url: str
) -> Dict[str, Any]
Get csl_item for a URL trying a sequence of strategies.
This function uses a list of CSL JSON Item metadata retrievers, specified
by the module-level variable url_retrievers
. The methods are attempted
in order, with this function returning the metadata from the first
non-failing method.
View Source
def get_url_csl_item(url: str) -> CSLItem:
"""
Get csl_item for a URL trying a sequence of strategies.
This function uses a list of CSL JSON Item metadata retrievers, specified
by the module-level variable `url_retrievers`. The methods are attempted
in order, with this function returning the metadata from the first
non-failing method.
"""
for retriever in url_retrievers:
try:
return retriever(url)
except Exception as error:
logging.warning(
f"Error in {retriever.__name__} for {url} "
f"due to a {error.__class__.__name__}:\n{error}"
)
logging.info(error, exc_info=True)
raise Exception(f"all get_url_csl_item methods failed for {url}")
get_url_csl_item_greycite
def get_url_csl_item_greycite(
url: str
) -> Dict[str, Any]
Uses Greycite which has experienced uptime problems in the past.
API calls seem to take at least 15 seconds. Browser requests are much faster. Setting header did not have an effect. Consider mimicking browser using selenium.
More information on Greycite at: http://greycite.knowledgeblog.org/ http://knowledgeblog.org/greycite https://arxiv.org/abs/1304.7151 https://git.io/v9N2C
View Source
def get_url_csl_item_greycite(url: str) -> CSLItem:
"""
Uses Greycite which has experienced uptime problems in the past.
API calls seem to take at least 15 seconds. Browser requests are much
faster. Setting header did not have an effect. Consider mimicking browser
using selenium.
More information on Greycite at:
http://greycite.knowledgeblog.org/
http://knowledgeblog.org/greycite
https://arxiv.org/abs/1304.7151
https://git.io/v9N2C
"""
import requests
from manubot.util import get_manubot_user_agent
headers = {
"Connection": "close", # https://github.com/kennethreitz/requests/issues/4023
"User-Agent": get_manubot_user_agent(),
}
response = requests.get(
"http://greycite.knowledgeblog.org/json", params={"uri": url}, headers=headers
)
response.raise_for_status()
# Some Greycite responses were valid JSON except for an error appended
# like "<p>*** Date set from uri<p>" or "<p>*** fetch error : 404<p>".
pattern = re.compile(r"<p>\*\*\*.*<p>")
text = pattern.sub("", response.text)
csl_item = json.loads(text)
csl_item["type"] = "webpage"
return csl_item
get_url_csl_item_manual
def get_url_csl_item_manual(
url: str
) -> Dict[str, Any]
Manually create csl_item for a URL.
View Source
def get_url_csl_item_manual(url: str) -> CSLItem:
"""
Manually create csl_item for a URL.
"""
return {"URL": url, "type": "webpage"}
get_url_csl_item_zotero
def get_url_csl_item_zotero(
url: str
) -> Dict[str, Any]
Use Zotero's translation-server to generate a CSL Item for the specified URL.
View Source
def get_url_csl_item_zotero(url: str) -> CSLItem:
"""
Use Zotero's translation-server to generate a CSL Item for the specified URL.
"""
from manubot.cite.zotero import export_as_csl, web_query
zotero_data = web_query(url)
csl_data = export_as_csl(zotero_data)
(csl_item,) = csl_data
if not csl_item.get("URL"):
# some Zotero translators don't set URL. https://github.com/manubot/manubot/issues/244
csl_item["URL"] = url
return csl_item
Classes
Handler_URL
class Handler_URL(
prefix_lower: str
)
A Handler is a class that provides support for a certain type of citekey.
For example, a Handler subclass could provide support for DOI citekeys. Subclasses enable custom logic for different citekey prefixes, including how to standardize the citekey and how to retrieve CSL Item metadata.
View Source
class Handler_URL(Handler):
standard_prefix = "url"
prefixes = [
"url",
"http",
"https",
]
def standardize_prefix_accession(self, accession):
if self.prefix_lower != "url":
accession = f"{self.prefix_lower}:{accession}"
return self.standard_prefix, accession
def get_csl_item(self, citekey):
return get_url_csl_item(citekey.standard_accession)
Ancestors (in MRO)
- manubot.cite.handlers.Handler
Class variables
prefixes
standard_prefix
Methods
get_csl_item
def get_csl_item(
self,
citekey
)
Return a CSL_Item with bibliographic details for citekey.
View Source
def get_csl_item(self, citekey):
return get_url_csl_item(citekey.standard_accession)
inspect
def inspect(
self,
citekey: manubot.cite.citekey.CiteKey
) -> Optional[str]
Check citekeys adhere to expected formats. If an issue is detected a
string describing the issue is returned. Otherwise returns None.
View Source
def inspect(self, citekey: CiteKey) -> Optional[str]:
"""
Check citekeys adhere to expected formats. If an issue is detected a
string describing the issue is returned. Otherwise returns None.
"""
pattern = self._get_pattern("accession_pattern")
if not pattern:
return None
if not pattern.fullmatch(citekey.accession):
return f"{citekey.accession} does not match regex {pattern.pattern}"
standardize_prefix_accession
def standardize_prefix_accession(
self,
accession
)
Return (prefix, accession) in standardized form.
This method defaults to returning self.standard_prefix
(or self.prefix_lower
if standard_prefix is not defined).
Subclasses can override this method with more specific standardization logic.
View Source
def standardize_prefix_accession(self, accession):
if self.prefix_lower != "url":
accession = f"{self.prefix_lower}:{accession}"
return self.standard_prefix, accession