Module manubot.cite.curie

Compact Uniform Resource Identifiers

Manubot keeps a local versions of the Bioregistry. Repository developers can run the following commands to update the Manubot version.

# regenerate manubot/cite/curie/bioregistry.json
python manubot/cite/curie/__init__.py
# if bioregistry.json has changed, the following test will likely fail:
pytest manubot/cite/tests/test_handlers.py::test_prefix_to_handler
# copy captured stdout from failed test_prefix_to_handler to
# manubot.cite.handlers.prefix_to_handler. Pre-commit hook will reformat file.

References:

https://bioregistry.io/
https://github.com/biopragmatics/bioregistry
https://en.wikipedia.org/wiki/CURIE
https://cthoyt.com/2021/10/07/biopragmatics-glossary.html
https://github.com/manubot/manubot/issues/305
https://identifiers.org/
https://github.com/manubot/manubot/issues/218
https://docs.identifiers.org/articles/api.html
https://n2t.net/e/compact_ids.html
https://n2t.net/e/cdl_ebi_prefixes.yaml
https://en.wikipedia.org/wiki/MIRIAM_Registry
Identifiers.org and MIRIAM Registry: community resources to provide persistent identification
On the road to robust data citation
Uniform Resolution of Compact Identifiers for Biomedical Data

View Source

"""

Compact Uniform Resource Identifiers

Manubot keeps a local versions of the Bioregistry.

Repository developers can run the following commands to update the Manubot version.

```shell

# regenerate manubot/cite/curie/bioregistry.json

python manubot/cite/curie/__init__.py

# if bioregistry.json has changed, the following test will likely fail:

pytest manubot/cite/tests/test_handlers.py::test_prefix_to_handler

# copy captured stdout from failed test_prefix_to_handler to

# manubot.cite.handlers.prefix_to_handler. Pre-commit hook will reformat file.

```

References:

- https://bioregistry.io/

- https://github.com/biopragmatics/bioregistry

- https://en.wikipedia.org/wiki/CURIE

- https://cthoyt.com/2021/10/07/biopragmatics-glossary.html

- https://github.com/manubot/manubot/issues/305

- https://identifiers.org/

- https://github.com/manubot/manubot/issues/218

- https://docs.identifiers.org/articles/api.html

- https://n2t.net/e/compact_ids.html

- https://n2t.net/e/cdl_ebi_prefixes.yaml

- https://en.wikipedia.org/wiki/MIRIAM_Registry

- [Identifiers.org and MIRIAM Registry: community resources to provide persistent identification](https://doi.org/10.1093/nar/gkr1097)

- [On the road to robust data citation](https://doi.org/10.1038/sdata.2018.95)

- [Uniform Resolution of Compact Identifiers for Biomedical Data](https://doi.org/10.1038/sdata.2018.29)

"""

import dataclasses

import functools

import json

import pathlib

import re

import typing

from manubot.cite.citekey import CiteKey

from manubot.cite.handlers import Handler

_keep_bioregistry_fields = {

    "deprecated",

    "example",

    "uri_format",

    "name",

    "pattern",

    "preferred_prefix",

    "synonyms",

}

bioregistry_path = pathlib.Path(__file__).parent.joinpath("bioregistry.json")

valid_prefix_pattern = re.compile(r"^[a-z0-9][a-z0-9._-]+?$")

"""

Ignore Bioregistry prefixes/synonyms that do not adhere to this pattern.

More permissive than the pattern at <https://github.com/biopragmatics/bioregistry/issues/158>,

because there are existing Bioregistry prefixes, even preferred prefixes, that are bad.

For example, starting with a number. We primarily care whether the prefix will work as part

of a pandoc citation key without requiring escaping.

"""

@dataclasses.dataclass

class Handler_CURIE(Handler):

    def __post_init__(self):

        try:

            self.resource = get_prefix_to_resource()[self.prefix_lower]

        except KeyError as err:

            raise ValueError(f"Unrecognized CURIE prefix {self.prefix_lower}") from err

        self.standard_prefix = (

            self.resource.get("preferred_prefix") or self.resource["prefix"]

        )

        self.prefixes = self.resource["all_prefixes"]

        if "pattern" in self.resource:

            self.accession_pattern = self.resource["pattern"]

    def get_csl_item(self, citekey: CiteKey):

        from ..url import get_url_csl_item

        url = self.get_url(accession=citekey.standard_accession)

        return get_url_csl_item(url)

    def inspect(self, citekey: CiteKey) -> typing.Optional[str]:

        pattern = self._get_pattern("accession_pattern")

        if pattern and not pattern.fullmatch(citekey.accession):

            return f"{citekey.accession} does not match regex {pattern.pattern}"

    def get_url(self, accession: str) -> str:

        if "uri_format" in self.resource:

            return self.resource["uri_format"].replace("$1", accession)

        return f"https://bioregistry.io/{self.standard_prefix}:{accession}"

def get_curie_handlers():

    """Get all possible CURIE handlers"""

    registries = get_bioregistry(compile_patterns=True)

    handlers = [Handler_CURIE(reg["prefix"]) for reg in registries]

    return handlers

def _download_bioregistry() -> None:

    """

    Download the Bioregistry consensus registry adding the following fields for each registry:

    - prefix: the standard lowercase registry prefix

    - all_prefixes: all distinct valid lowercase prefixes including synonyms

    """

    import requests

    url = "https://github.com/biopragmatics/bioregistry/raw/main/exports/registry/registry.json"

    response = requests.get(url)

    response.raise_for_status()

    results = response.json()

    assert isinstance(results, dict)

    registry = []

    for prefix, resource in results.items():

        assert isinstance(resource, dict)

        if not resource.get("uri_format"):

            # discard unresolvable prefixes

            continue

        for field in set(resource) - _keep_bioregistry_fields:

            del resource[field]

        resource["prefix"] = prefix

        all_prefixes = {

            prefix,

            *(x.lower() for x in resource.pop("synonyms", [])),

        }

        # remove invalid prefixes as per https://github.com/manubot/manubot/pull/306#discussion_r744125504

        resource["all_prefixes"] = sorted(

            filter(valid_prefix_pattern.fullmatch, all_prefixes)

        )

        registry.append(resource)

    json_text = json.dumps(registry, indent=2, ensure_ascii=False)

    bioregistry_path.write_text(json_text + "\n", encoding="utf-8")

def get_bioregistry(compile_patterns=False) -> dict:

    with bioregistry_path.open(encoding="utf-8-sig") as read_file:

        registry = json.load(read_file)

    assert isinstance(registry, list)

    if compile_patterns:

        for resource in registry:

            if "pattern" in resource:

                resource["compiled_pattern"] = re.compile(resource["pattern"])

    return registry

@functools.lru_cache

def get_prefix_to_resource() -> typing.Dict[str, typing.Dict]:

    prefix_to_resource = {}

    for resource in get_bioregistry():

        for prefix in resource["all_prefixes"]:

            prefix_to_resource[prefix] = resource

    return prefix_to_resource

def standardize_curie(curie: str) -> str:

    """

    Return CURIE with Bioregistry preferred prefix capitalization.

    `curie` should be in `prefix:accession` format.

    If `curie` is malformed or uses an unrecognized prefix, raise ValueError.

    """

    if not isinstance(curie, str):

        raise TypeError(

            f"curie parameter should be string. Received {curie.__class__.__name__} instead for {curie}"

        )

    try:

        prefix, accession = curie.split(":", 1)

    except ValueError as err:

        raise ValueError(

            f"curie must be splittable by `:` and formatted like `prefix:accession`. Received {curie}"

        ) from err

    handler = Handler_CURIE(prefix.lower())

    return f"{handler.standard_prefix}:{accession}"

def curie_to_url(curie: str) -> str:

    """

    `curie` should be in `prefix:accession` format

    """

    curie = standardize_curie(curie)

    prefix, accession = curie.split(":", 1)

    handler = Handler_CURIE(prefix.lower())

    return handler.get_url(accession)

if __name__ == "__main__":

    _download_bioregistry()

    bioregistry = get_bioregistry()

Variables

bioregistry_path

valid_prefix_pattern

Ignore Bioregistry prefixes/synonyms that do not adhere to this pattern.

More permissive than the pattern at https://github.com/biopragmatics/bioregistry/issues/158, because there are existing Bioregistry prefixes, even preferred prefixes, that are bad. For example, starting with a number. We primarily care whether the prefix will work as part of a pandoc citation key without requiring escaping.

Functions

curie_to_url

def curie_to_url(
    curie: str
) -> str

curie should be in prefix:accession format

View Source

def curie_to_url(curie: str) -> str:

    """

    `curie` should be in `prefix:accession` format

    """

    curie = standardize_curie(curie)

    prefix, accession = curie.split(":", 1)

    handler = Handler_CURIE(prefix.lower())

    return handler.get_url(accession)

get_bioregistry

def get_bioregistry(
    compile_patterns=False
) -> dict

View Source

def get_bioregistry(compile_patterns=False) -> dict:

    with bioregistry_path.open(encoding="utf-8-sig") as read_file:

        registry = json.load(read_file)

    assert isinstance(registry, list)

    if compile_patterns:

        for resource in registry:

            if "pattern" in resource:

                resource["compiled_pattern"] = re.compile(resource["pattern"])

    return registry

get_curie_handlers

def get_curie_handlers(

)

Get all possible CURIE handlers

View Source

def get_curie_handlers():

    """Get all possible CURIE handlers"""

    registries = get_bioregistry(compile_patterns=True)

    handlers = [Handler_CURIE(reg["prefix"]) for reg in registries]

    return handlers

get_prefix_to_resource

def get_prefix_to_resource(

) -> Dict[str, Dict]

View Source

@functools.lru_cache

def get_prefix_to_resource() -> typing.Dict[str, typing.Dict]:

    prefix_to_resource = {}

    for resource in get_bioregistry():

        for prefix in resource["all_prefixes"]:

            prefix_to_resource[prefix] = resource

    return prefix_to_resource

standardize_curie

def standardize_curie(
    curie: str
) -> str

Return CURIE with Bioregistry preferred prefix capitalization.

curie should be in prefix:accession format. If curie is malformed or uses an unrecognized prefix, raise ValueError.

View Source

def standardize_curie(curie: str) -> str:

    """

    Return CURIE with Bioregistry preferred prefix capitalization.

    `curie` should be in `prefix:accession` format.

    If `curie` is malformed or uses an unrecognized prefix, raise ValueError.

    """

    if not isinstance(curie, str):

        raise TypeError(

            f"curie parameter should be string. Received {curie.__class__.__name__} instead for {curie}"

        )

    try:

        prefix, accession = curie.split(":", 1)

    except ValueError as err:

        raise ValueError(

            f"curie must be splittable by `:` and formatted like `prefix:accession`. Received {curie}"

        ) from err

    handler = Handler_CURIE(prefix.lower())

    return f"{handler.standard_prefix}:{accession}"

Classes

Handler_CURIE

class Handler_CURIE(
    prefix_lower: str
)

Handler_CURIE(prefix_lower: str)

View Source

@dataclasses.dataclass

class Handler_CURIE(Handler):

    def __post_init__(self):

        try:

            self.resource = get_prefix_to_resource()[self.prefix_lower]

        except KeyError as err:

            raise ValueError(f"Unrecognized CURIE prefix {self.prefix_lower}") from err

        self.standard_prefix = (

            self.resource.get("preferred_prefix") or self.resource["prefix"]

        )

        self.prefixes = self.resource["all_prefixes"]

        if "pattern" in self.resource:

            self.accession_pattern = self.resource["pattern"]

    def get_csl_item(self, citekey: CiteKey):

        from ..url import get_url_csl_item

        url = self.get_url(accession=citekey.standard_accession)

        return get_url_csl_item(url)

    def inspect(self, citekey: CiteKey) -> typing.Optional[str]:

        pattern = self._get_pattern("accession_pattern")

        if pattern and not pattern.fullmatch(citekey.accession):

            return f"{citekey.accession} does not match regex {pattern.pattern}"

    def get_url(self, accession: str) -> str:

        if "uri_format" in self.resource:

            return self.resource["uri_format"].replace("$1", accession)

        return f"https://bioregistry.io/{self.standard_prefix}:{accession}"

Ancestors (in MRO)

manubot.cite.handlers.Handler

Class variables

prefixes

Methods

get_csl_item

def get_csl_item(
    self,
    citekey: manubot.cite.citekey.CiteKey
)

Return a CSL_Item with bibliographic details for citekey.

View Source

    def get_csl_item(self, citekey: CiteKey):

        from ..url import get_url_csl_item

        url = self.get_url(accession=citekey.standard_accession)

        return get_url_csl_item(url)

get_url

def get_url(
    self,
    accession: str
) -> str

View Source

    def get_url(self, accession: str) -> str:

        if "uri_format" in self.resource:

            return self.resource["uri_format"].replace("$1", accession)

        return f"https://bioregistry.io/{self.standard_prefix}:{accession}"

inspect

def inspect(
    self,
    citekey: manubot.cite.citekey.CiteKey
) -> Optional[str]

Check citekeys adhere to expected formats. If an issue is detected a

string describing the issue is returned. Otherwise returns None.

View Source

    def inspect(self, citekey: CiteKey) -> typing.Optional[str]:

        pattern = self._get_pattern("accession_pattern")

        if pattern and not pattern.fullmatch(citekey.accession):

            return f"{citekey.accession} does not match regex {pattern.pattern}"

standardize_prefix_accession

def standardize_prefix_accession(
    self,
    accession: str
) -> Tuple[str, str]

Return (prefix, accession) in standardized form.

This method defaults to returning self.standard_prefix (or self.prefix_lower if standard_prefix is not defined). Subclasses can override this method with more specific standardization logic.

View Source

    def standardize_prefix_accession(self, accession: str) -> Tuple[str, str]:

        """

        Return (prefix, accession) in standardized form.

        This method defaults to returning `self.standard_prefix`

        (or `self.prefix_lower` if standard_prefix is not defined).

        Subclasses can override this method with more specific standardization logic.

        """

        standard_prefix = getattr(self, "standard_prefix", self.prefix_lower)

        standard_accession = accession

        return standard_prefix, standard_accession