
Module manubot.cite.url

View Source
import json
import logging
import re
from typing import Any, Dict

from .handlers import Handler

CSLItem = Dict[str, Any]


class Handler_URL(Handler):

    standard_prefix = "url"

    prefixes = [
        "url",
        "http",
        "https",
    ]

    def standardize_prefix_accession(self, accession):
        if self.prefix_lower != "url":
            accession = f"{self.prefix_lower}:{accession}"
        return self.standard_prefix, accession

    def get_csl_item(self, citekey):
        return get_url_csl_item(citekey.standard_accession)


def get_url_csl_item(url: str) -> CSLItem:
    """
    Get csl_item for a URL trying a sequence of strategies.

    This function uses a list of CSL JSON Item metadata retrievers, specified
    by the module-level variable `url_retrievers`. The methods are attempted
    in order, with this function returning the metadata from the first
    non-failing method.
    """
    for retriever in url_retrievers:
        try:
            return retriever(url)
        except Exception as error:
            logging.warning(
                f"Error in {retriever.__name__} for {url} "
                f"due to a {error.__class__.__name__}:\n{error}"
            )
            logging.info(error, exc_info=True)
    raise Exception(f"all get_url_csl_item methods failed for {url}")


def get_url_csl_item_zotero(url: str) -> CSLItem:
    """
    Use Zotero's translation-server to generate a CSL Item for the specified URL.
    """
    from manubot.cite.zotero import export_as_csl, web_query

    zotero_data = web_query(url)
    csl_data = export_as_csl(zotero_data)
    (csl_item,) = csl_data
    if not csl_item.get("URL"):
        # some Zotero translators don't set URL. https://github.com/manubot/manubot/issues/244
        csl_item["URL"] = url
    return csl_item


def get_url_csl_item_greycite(url: str) -> CSLItem:
    """
    Uses Greycite, which has experienced uptime problems in the past.
    API calls seem to take at least 15 seconds. Browser requests are much
    faster. Setting headers did not have an effect. Consider mimicking a
    browser using Selenium.

    More information on Greycite at:
    http://greycite.knowledgeblog.org/
    http://knowledgeblog.org/greycite
    https://arxiv.org/abs/1304.7151
    https://git.io/v9N2C
    """
    import requests

    from manubot.util import get_manubot_user_agent

    headers = {
        "Connection": "close",  # https://github.com/kennethreitz/requests/issues/4023
        "User-Agent": get_manubot_user_agent(),
    }
    response = requests.get(
        "http://greycite.knowledgeblog.org/json", params={"uri": url}, headers=headers
    )
    response.raise_for_status()
    # Some Greycite responses were valid JSON except for an error appended,
    # like "<p>*** Date set from uri<p>" or "<p>*** fetch error : 404<p>".
    pattern = re.compile(r"<p>\*\*\*.*<p>")
    text = pattern.sub("", response.text)
    csl_item = json.loads(text)
    csl_item["type"] = "webpage"
    return csl_item


def get_url_csl_item_manual(url: str) -> CSLItem:
    """
    Manually create csl_item for a URL.
    """
    return {"URL": url, "type": "webpage"}


url_retrievers = [
    get_url_csl_item_zotero,
    get_url_csl_item_greycite,
    get_url_csl_item_manual,
]

Variables

CSLItem
url_retrievers

Functions

get_url_csl_item

def get_url_csl_item(
    url: str
) -> Dict[str, Any]

Get csl_item for a URL trying a sequence of strategies.

This function uses a list of CSL JSON Item metadata retrievers, specified by the module-level variable url_retrievers. The methods are attempted in order, with this function returning the metadata from the first non-failing method.
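
For example, a caller can pass any web page URL and receive a CSL Item; if both the Zotero and Greycite retrievers fail, the manual fallback still returns a minimal record. A sketch, using a placeholder URL:

from manubot.cite.url import get_url_csl_item

# Placeholder URL for illustration; any reachable web page should work.
csl_item = get_url_csl_item("https://example.com/some-post")
print(csl_item.get("URL"), csl_item.get("type"))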

View Source
def get_url_csl_item(url: str) -> CSLItem:
    """
    Get csl_item for a URL trying a sequence of strategies.

    This function uses a list of CSL JSON Item metadata retrievers, specified
    by the module-level variable `url_retrievers`. The methods are attempted
    in order, with this function returning the metadata from the first
    non-failing method.
    """
    for retriever in url_retrievers:
        try:
            return retriever(url)
        except Exception as error:
            logging.warning(
                f"Error in {retriever.__name__} for {url} "
                f"due to a {error.__class__.__name__}:\n{error}"
            )
            logging.info(error, exc_info=True)
    raise Exception(f"all get_url_csl_item methods failed for {url}")

get_url_csl_item_greycite

def get_url_csl_item_greycite(
    url: str
) -> Dict[str, Any]

Uses Greycite, which has experienced uptime problems in the past.

API calls seem to take at least 15 seconds. Browser requests are much faster. Setting headers did not have an effect. Consider mimicking a browser using Selenium.

More information on Greycite at: http://greycite.knowledgeblog.org/ http://knowledgeblog.org/greycite https://arxiv.org/abs/1304.7151 https://git.io/v9N2C
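
Because of the uptime and latency issues noted above, a direct call is best wrapped in error handling. A minimal sketch, using a placeholder URL:

from manubot.cite.url import get_url_csl_item_greycite

try:
    csl_item = get_url_csl_item_greycite("https://example.com/some-post")
    print(csl_item["type"])  # always set to "webpage" by this retriever
except Exception as error:
    # Greycite may be slow or unavailable; get_url_csl_item falls back to other retrievers.
    print(f"Greycite lookup failed: {error}")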

View Source
def get_url_csl_item_greycite(url: str) -> CSLItem:
    """
    Uses Greycite, which has experienced uptime problems in the past.
    API calls seem to take at least 15 seconds. Browser requests are much
    faster. Setting headers did not have an effect. Consider mimicking a
    browser using Selenium.

    More information on Greycite at:
    http://greycite.knowledgeblog.org/
    http://knowledgeblog.org/greycite
    https://arxiv.org/abs/1304.7151
    https://git.io/v9N2C
    """
    import requests

    from manubot.util import get_manubot_user_agent

    headers = {
        "Connection": "close",  # https://github.com/kennethreitz/requests/issues/4023
        "User-Agent": get_manubot_user_agent(),
    }
    response = requests.get(
        "http://greycite.knowledgeblog.org/json", params={"uri": url}, headers=headers
    )
    response.raise_for_status()
    # Some Greycite responses were valid JSON except for an error appended,
    # like "<p>*** Date set from uri<p>" or "<p>*** fetch error : 404<p>".
    pattern = re.compile(r"<p>\*\*\*.*<p>")
    text = pattern.sub("", response.text)
    csl_item = json.loads(text)
    csl_item["type"] = "webpage"
    return csl_item

get_url_csl_item_manual

def get_url_csl_item_manual(
    url: str
) -> Dict[str, Any]

Manually create csl_item for a URL.

View Source
def get_url_csl_item_manual(url: str) -> CSLItem:
    """
    Manually create csl_item for a URL.
    """
    return {"URL": url, "type": "webpage"}

get_url_csl_item_zotero

def get_url_csl_item_zotero(
    url: str
) -> Dict[str, Any]

Use Zotero's translation-server to generate a CSL Item for the specified URL.
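
A direct call looks like the sketch below; it requires network access to a Zotero translation-server, and the URL shown is a placeholder:

from manubot.cite.url import get_url_csl_item_zotero

csl_item = get_url_csl_item_zotero("https://example.com/some-post")
# The URL field is backfilled from the input when a translator omits it.
print(csl_item["URL"])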

View Source
def get_url_csl_item_zotero(url: str) -> CSLItem:
    """
    Use Zotero's translation-server to generate a CSL Item for the specified URL.
    """
    from manubot.cite.zotero import export_as_csl, web_query

    zotero_data = web_query(url)
    csl_data = export_as_csl(zotero_data)
    (csl_item,) = csl_data
    if not csl_item.get("URL"):
        # some Zotero translators don't set URL. https://github.com/manubot/manubot/issues/244
        csl_item["URL"] = url
    return csl_item

Classes

Handler_URL

class Handler_URL(
    prefix_lower: str
)

A Handler is a class that provides support for a certain type of citekey.

For example, a Handler subclass could provide support for DOI citekeys. Subclasses enable custom logic for different citekey prefixes, including how to standardize the citekey and how to retrieve CSL Item metadata.
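
As an illustration, a hypothetical subclass (not part of manubot) only needs to declare its prefixes and override the retrieval logic:

from manubot.cite.handlers import Handler

class Handler_Example(Handler):  # hypothetical subclass for illustration only
    standard_prefix = "example"
    prefixes = ["example"]

    def get_csl_item(self, citekey):
        # Return minimal CSL metadata for the standardized accession.
        return {"URL": f"https://example.com/{citekey.standard_accession}", "type": "webpage"}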

View Source
class Handler_URL(Handler):

    standard_prefix = "url"

    prefixes = [
        "url",
        "http",
        "https",
    ]

    def standardize_prefix_accession(self, accession):
        if self.prefix_lower != "url":
            accession = f"{self.prefix_lower}:{accession}"
        return self.standard_prefix, accession

    def get_csl_item(self, citekey):
        return get_url_csl_item(citekey.standard_accession)

Ancestors (in MRO)

  • manubot.cite.handlers.Handler

Class variables

prefixes
standard_prefix

Methods

get_csl_item

def get_csl_item(
    self,
    citekey
)

Return a CSL_Item with bibliographic details for citekey.

View Source
    def get_csl_item(self, citekey):
        return get_url_csl_item(citekey.standard_accession)

inspect

def inspect(
    self,
    citekey: manubot.cite.citekey.CiteKey
) -> Optional[str]

Check that citekeys adhere to expected formats. If an issue is detected, a string describing the issue is returned. Otherwise returns None.

View Source
    def inspect(self, citekey: CiteKey) -> Optional[str]:
        """
        Check that citekeys adhere to expected formats. If an issue is detected,
        a string describing the issue is returned. Otherwise returns None.
        """
        pattern = self._get_pattern("accession_pattern")
        if not pattern:
            return None
        if not pattern.fullmatch(citekey.accession):
            return f"{citekey.accession} does not match regex {pattern.pattern}"

standardize_prefix_accession

def standardize_prefix_accession(
    self,
    accession
)

Return (prefix, accession) in standardized form.

This method defaults to returning self.standard_prefix (or self.prefix_lower if standard_prefix is not defined). Subclasses can override this method with more specific standardization logic.
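
For Handler_URL, the override folds non-url prefixes back into the accession so that the scheme is preserved. A sketch using the constructor signature shown above:

from manubot.cite.url import Handler_URL

print(Handler_URL("url").standardize_prefix_accession("https://example.com/page"))
# ('url', 'https://example.com/page')
print(Handler_URL("https").standardize_prefix_accession("example.com/page"))
# ('url', 'https:example.com/page')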

View Source
    def standardize_prefix_accession(self, accession):
        if self.prefix_lower != "url":
            accession = f"{self.prefix_lower}:{accession}"
        return self.standard_prefix, accession