Module manubot.process.metadata
Tools for manuscript metadata processing including thumbnail detection and processing.
View Source
"""
Tools for manuscript metadata processing including thumbnail detection and processing.
"""
import functools
import logging
import pathlib
import subprocess
from typing import Optional
from urllib.parse import urljoin
def get_header_includes(variables: dict) -> str:
    """
    Produce the header-includes text by rendering the
    `header-includes-template.html` file that sits next to this module,
    with `variables` supplied to the jinja2 templating helper.

    Any failure while reading or rendering the template is logged with its
    traceback and an empty string is returned instead.
    """
    from .util import template_with_jinja2

    template_path = pathlib.Path(__file__).parent / "header-includes-template.html"
    try:
        # utf-8-sig transparently drops a leading byte-order mark, if present
        source = template_path.read_text(encoding="utf-8-sig")
        rendered = template_with_jinja2(source, variables)
    except Exception:
        logging.exception("Error generating header-includes.")
        return ""
    return rendered
def get_thumbnail_url(thumbnail=None):
    """
    Resolve `thumbnail` (a path, a URL, or None) to an absolute thumbnail URL.

    - A URL is returned as-is.
    - A falsy value triggers a search for `thumbnail.png` inside the git
      repository this function is executed from.
    - A local path (given or detected) is expected to be relative to the root
      of its git repository and is converted to a GitHub raw URL.

    The result is None when no thumbnail is found or no URL can be built.
    """
    from manubot.util import is_http_url

    if thumbnail:
        if is_http_url(thumbnail):
            logging.debug("provided thumbnail is a URL. Pass it through.")
            return thumbnail
    else:
        thumbnail = _find_thumbnail_path()
        if thumbnail:
            outcome = f"Thumbnail detected at {thumbnail!r}"
        else:
            outcome = "No local thumbnail detected"
        logging.debug(
            "get_thumbnail_url: thumbnail location not explicitly provided. "
            + outcome
        )
    return _thumbnail_path_to_url(thumbnail)
def _find_thumbnail_path():
    """
    Locate a `thumbnail.png` file anywhere inside the current git repository.

    Returns the match as a POSIX-style path relative to the repository root.
    When several files match, the one with the fewest parent directories wins,
    with ties broken by path ordering. Returns `None` if the repository root
    cannot be determined or no such file exists.
    """
    root = git_repository_root()
    if not root:
        return None
    candidates = [found.relative_to(root) for found in root.glob("**/thumbnail.png")]
    if not candidates:
        return None
    # shallowest path first; the path itself is the tie-breaker
    best = min(candidates, key=lambda rel: (len(rel.parents), rel))
    return best.as_posix()
def _thumbnail_path_to_url(path):
"""
Convert a local thumbnail path (string) to an absolute URL using the GitHub
repository location detected using `get_continuous_integration_parameters`.
"""
if not path:
return None
from .ci import get_continuous_integration_parameters
info = get_continuous_integration_parameters()
try:
url = f"https://github.com/{info['repo_slug']}/raw/{info['triggering_commit']}/{path}"
except (TypeError, KeyError):
return None
return url
def get_head_commit() -> Optional[str]:
    """
    Return the output of `git rev-parse HEAD` (the commit that `HEAD` resolves
    to) for the git repository of the current working directory.

    Returns `None`, after logging a warning, when the commit cannot be
    determined: either git exits with a non-zero status (for example when not
    inside a repository) or the git executable cannot be run at all.
    """
    args = ["git", "rev-parse", "HEAD"]
    try:
        return subprocess.check_output(args, stderr=subprocess.PIPE, text=True).strip()
    except subprocess.CalledProcessError as error:
        from manubot.util import shlex_join

        logging.warning(
            f"get_head_commit: {shlex_join(error.cmd)!r} returned exit code {error.returncode} "
            f"with the following stdout:\n{error.stdout}\n"
            f"And the following stderr:\n{error.stderr}"
        )
    except OSError as error:
        # git missing or not executable: treat like any other detection
        # failure, consistent with git_repository_root
        logging.warning(f"get_head_commit: unable to execute git: {error}")
    return None
@functools.lru_cache
def git_repository_root():
    """
    Determine the root directory of the git repository containing the current
    working directory.

    Returns a `pathlib.Path`, or `None` if the root cannot be determined.
    The result is cached, so git is only queried on the first call.
    """
    # The superproject query is tried first; it prints nothing unless the
    # repository is a submodule, in which case the regular toplevel is used.
    for flag in ("--show-superproject-working-tree", "--show-toplevel"):
        try:
            stdout = subprocess.check_output(["git", "rev-parse", flag], text=True)
        except (subprocess.CalledProcessError, OSError):
            continue
        location = stdout.rstrip("\r\n")
        if location:
            return pathlib.Path(location)
    return None
def get_manuscript_urls(html_url: Optional[str] = None) -> dict:
    """
    Return a dictionary with URLs for a manuscript.

    An example for a manuscript where all URLs get set, inferred from continuous integration environment variables, is:

    ```python
    {
        "html_url": "https://manubot.github.io/rootstock/",
        "pdf_url": "https://manubot.github.io/rootstock/manuscript.pdf",
        "html_url_versioned": "https://manubot.github.io/rootstock/v/7cf9071212ce33116ad09cf2237a370b180a3c35/",
        "pdf_url_versioned": "https://manubot.github.io/rootstock/v/7cf9071212ce33116ad09cf2237a370b180a3c35/manuscript.pdf",
    }
    ```

    Provide `html_url` to set a custom domain.
    If `html_url="https://git.dhimmel.com/bitcoin-whitepaper/"`,
    the return dictionary will be like:

    ```python
    {
        "html_url": "https://git.dhimmel.com/bitcoin-whitepaper/",
        "pdf_url": "https://git.dhimmel.com/bitcoin-whitepaper/manuscript.pdf",
        "html_url_versioned": "https://git.dhimmel.com/bitcoin-whitepaper/v/cb1f2c12eec8b56db9ef5f641ec805e2d449d319/",
        "pdf_url_versioned": "https://git.dhimmel.com/bitcoin-whitepaper/v/cb1f2c12eec8b56db9ef5f641ec805e2d449d319/manuscript.pdf",
    }
    ```

    Note the trailing `/` in `html_url`, which is required for proper functioning.

    The returned dictionary is empty when `html_url` is not provided and no
    continuous integration parameters are available. The versioned URLs are
    only set when continuous integration parameters are available.

    When all URLs are set, `html_url` is probed with an HTTP HEAD request.
    The probe only produces log messages: a failed or timed-out request is
    logged as a warning and never affects the returned URLs.
    """
    import requests

    from .ci import get_continuous_integration_parameters

    urls = {}
    ci_params = get_continuous_integration_parameters()
    if html_url is None:
        if not ci_params:
            return urls
        html_url = "https://{repo_owner}.github.io/{repo_name}/".format(**ci_params)
    urls["html_url"] = html_url
    urls["pdf_url"] = urljoin(html_url, "manuscript.pdf")
    if not ci_params:
        return urls
    urls["html_url_versioned"] = urljoin(html_url, "v/{commit}/".format(**ci_params))
    urls["pdf_url_versioned"] = urljoin(urls["html_url_versioned"], "manuscript.pdf")
    # The accessibility probe is diagnostic only. requests never times out
    # unless told to, so bound the wait and keep network failures from
    # discarding URLs that are already computed.
    try:
        response = requests.head(html_url, allow_redirects=True, timeout=10)
    except requests.RequestException as error:
        logging.warning(
            "html_url accessibility could not be checked. "
            f"Requesting {html_url} raised {error!r}."
        )
        return urls
    if not response.ok:
        logging.warning(
            "html_url is not web accessible. "
            f"{html_url} returned status code {response.status_code}. "
            "Ignore this warning if the manuscript has not yet been deployed for the first time. "
        )
    if response.history:
        logging.info(
            "html_url includes redirects. In order of oldest to most recent:\n"
            + "\n".join(x.url for x in response.history + [response])
        )
    return urls
def get_software_versions(rootstock: bool = True) -> dict:
    """
    Collect version information for the software components involved:

    - manubot_version: the version number of the manubot python package.
    - rootstock_commit: the rootstock commit hash reported by
      `get_rootstock_commit`, or None when `rootstock` is false.

    Pass `rootstock=False` to skip looking up the rootstock commit. When it is
    true, `get_rootstock_commit` is called, which may add a `rootstock` remote
    to the local git repository and fetch from it.
    """
    from manubot import __version__ as manubot_version

    versions = {"manubot_version": manubot_version, "rootstock_commit": None}
    if rootstock:
        versions["rootstock_commit"] = get_rootstock_commit()
    return versions
def get_rootstock_commit() -> Optional[str]:
    """
    Return the most recent commit in common between the git repository
    this function is run within (usually a Manubot manuscript repository)
    and the `main` branch of the `rootstock` remote.

    Returns `None`, after logging a warning, when detection fails: either a
    git command exits with a non-zero status or the git executable cannot be
    run at all.

    WARNING: This function may modify the git repository it's executed within:

    - if the repository has not set the `rootstock` remote, it is set to
      point to the default Rootstock repository of <https://github.com/manubot/rootstock>.
    - fetches the latest commits in the `main` branch of the `rootstock` remote
    """
    rootstock_remote = "https://github.com/manubot/rootstock.git"
    try:
        # add rootstock remote if remote is not already set; a non-zero exit
        # status is ignored on purpose
        args = ["git", "remote", "add", "rootstock", rootstock_remote]
        process = subprocess.run(args, capture_output=True)
        if process.returncode == 0:
            logging.info(
                "get_rootstock_commit added a `rootstock` remote to the git repository."
            )
        # find most recent common ancestor commit
        args = ["git", "fetch", "rootstock", "main"]
        subprocess.check_output(args, stderr=subprocess.PIPE, text=True)
        args = ["git", "merge-base", "HEAD", "rootstock/main"]
        output = subprocess.check_output(args, stderr=subprocess.PIPE, text=True)
    except subprocess.CalledProcessError as error:
        from manubot.util import shlex_join

        logging.warning(
            f"get_rootstock_commit: {shlex_join(error.cmd)!r} returned exit code {error.returncode} "
            f"with the following stdout:\n{error.stdout}\n"
            f"And the following stderr:\n{error.stderr}"
        )
        return None
    except OSError as error:
        # git missing or not executable: report a failed detection as None
        # instead of raising
        logging.warning(f"get_rootstock_commit: unable to execute git: {error}")
        return None
    return output.strip()
Functions
get_head_commit
def get_head_commit(
) -> Optional[str]
View Source
def get_head_commit() -> Optional[str]:
    """
    Return the output of `git rev-parse HEAD` (the commit that `HEAD` resolves
    to) for the git repository of the current working directory.

    Returns `None`, after logging a warning, when the commit cannot be
    determined: either git exits with a non-zero status (for example when not
    inside a repository) or the git executable cannot be run at all.
    """
    args = ["git", "rev-parse", "HEAD"]
    try:
        return subprocess.check_output(args, stderr=subprocess.PIPE, text=True).strip()
    except subprocess.CalledProcessError as error:
        from manubot.util import shlex_join

        logging.warning(
            f"get_head_commit: {shlex_join(error.cmd)!r} returned exit code {error.returncode} "
            f"with the following stdout:\n{error.stdout}\n"
            f"And the following stderr:\n{error.stderr}"
        )
    except OSError as error:
        # git missing or not executable: treat like any other detection
        # failure, consistent with git_repository_root
        logging.warning(f"get_head_commit: unable to execute git: {error}")
    return None
get_header_includes
def get_header_includes(
variables: dict
) -> str
Render header-includes-template.html using information from variables.
View Source
def get_header_includes(variables: dict) -> str:
    """
    Produce the header-includes text by rendering the
    `header-includes-template.html` file that sits next to this module,
    with `variables` supplied to the jinja2 templating helper.

    Any failure while reading or rendering the template is logged with its
    traceback and an empty string is returned instead.
    """
    from .util import template_with_jinja2

    template_path = pathlib.Path(__file__).parent / "header-includes-template.html"
    try:
        # utf-8-sig transparently drops a leading byte-order mark, if present
        source = template_path.read_text(encoding="utf-8-sig")
        rendered = template_with_jinja2(source, variables)
    except Exception:
        logging.exception("Error generating header-includes.")
        return ""
    return rendered
get_manuscript_urls
def get_manuscript_urls(
html_url: Optional[str] = None
) -> dict
Return a dictionary with URLs for a manuscript.
An example for a manuscript where all URLs get set, inferred from continuous integration environment variables, is:
{
"html_url": "https://manubot.github.io/rootstock/",
"pdf_url": "https://manubot.github.io/rootstock/manuscript.pdf",
"html_url_versioned": "https://manubot.github.io/rootstock/v/7cf9071212ce33116ad09cf2237a370b180a3c35/",
"pdf_url_versioned": "https://manubot.github.io/rootstock/v/7cf9071212ce33116ad09cf2237a370b180a3c35/manuscript.pdf",
}
Provide html_url to set a custom domain.
If html_url="https://git.dhimmel.com/bitcoin-whitepaper/",
the return dictionary will be like:
{
"html_url": "https://git.dhimmel.com/bitcoin-whitepaper/",
"pdf_url": "https://git.dhimmel.com/bitcoin-whitepaper/manuscript.pdf",
"html_url_versioned": "https://git.dhimmel.com/bitcoin-whitepaper/v/cb1f2c12eec8b56db9ef5f641ec805e2d449d319/",
"pdf_url_versioned": "https://git.dhimmel.com/bitcoin-whitepaper/v/cb1f2c12eec8b56db9ef5f641ec805e2d449d319/manuscript.pdf",
}
Note the trailing / in html_url, which is required for proper functioning.
View Source
def get_manuscript_urls(html_url: Optional[str] = None) -> dict:
    """
    Return a dictionary with URLs for a manuscript.

    An example for a manuscript where all URLs get set, inferred from continuous integration environment variables, is:

    ```python
    {
        "html_url": "https://manubot.github.io/rootstock/",
        "pdf_url": "https://manubot.github.io/rootstock/manuscript.pdf",
        "html_url_versioned": "https://manubot.github.io/rootstock/v/7cf9071212ce33116ad09cf2237a370b180a3c35/",
        "pdf_url_versioned": "https://manubot.github.io/rootstock/v/7cf9071212ce33116ad09cf2237a370b180a3c35/manuscript.pdf",
    }
    ```

    Provide `html_url` to set a custom domain.
    If `html_url="https://git.dhimmel.com/bitcoin-whitepaper/"`,
    the return dictionary will be like:

    ```python
    {
        "html_url": "https://git.dhimmel.com/bitcoin-whitepaper/",
        "pdf_url": "https://git.dhimmel.com/bitcoin-whitepaper/manuscript.pdf",
        "html_url_versioned": "https://git.dhimmel.com/bitcoin-whitepaper/v/cb1f2c12eec8b56db9ef5f641ec805e2d449d319/",
        "pdf_url_versioned": "https://git.dhimmel.com/bitcoin-whitepaper/v/cb1f2c12eec8b56db9ef5f641ec805e2d449d319/manuscript.pdf",
    }
    ```

    Note the trailing `/` in `html_url`, which is required for proper functioning.

    The returned dictionary is empty when `html_url` is not provided and no
    continuous integration parameters are available. The versioned URLs are
    only set when continuous integration parameters are available.

    When all URLs are set, `html_url` is probed with an HTTP HEAD request.
    The probe only produces log messages: a failed or timed-out request is
    logged as a warning and never affects the returned URLs.
    """
    import requests

    from .ci import get_continuous_integration_parameters

    urls = {}
    ci_params = get_continuous_integration_parameters()
    if html_url is None:
        if not ci_params:
            return urls
        html_url = "https://{repo_owner}.github.io/{repo_name}/".format(**ci_params)
    urls["html_url"] = html_url
    urls["pdf_url"] = urljoin(html_url, "manuscript.pdf")
    if not ci_params:
        return urls
    urls["html_url_versioned"] = urljoin(html_url, "v/{commit}/".format(**ci_params))
    urls["pdf_url_versioned"] = urljoin(urls["html_url_versioned"], "manuscript.pdf")
    # The accessibility probe is diagnostic only. requests never times out
    # unless told to, so bound the wait and keep network failures from
    # discarding URLs that are already computed.
    try:
        response = requests.head(html_url, allow_redirects=True, timeout=10)
    except requests.RequestException as error:
        logging.warning(
            "html_url accessibility could not be checked. "
            f"Requesting {html_url} raised {error!r}."
        )
        return urls
    if not response.ok:
        logging.warning(
            "html_url is not web accessible. "
            f"{html_url} returned status code {response.status_code}. "
            "Ignore this warning if the manuscript has not yet been deployed for the first time. "
        )
    if response.history:
        logging.info(
            "html_url includes redirects. In order of oldest to most recent:\n"
            + "\n".join(x.url for x in response.history + [response])
        )
    return urls
get_rootstock_commit
def get_rootstock_commit(
) -> Optional[str]
Return the most recent commit in common between the git repository
this function is run within (usually a Manubot manuscript repository)
and the main
branch of the rootstock
remote.
WARNING: This function may modify the git repository it's executed within:
- if the repository has not set the
rootstock
remote, it is set to point to the default Rootstock repository of https://github.com/manubot/rootstock.
- fetches the latest commits in the
main
branch of the rootstock
remote
View Source
def get_rootstock_commit() -> Optional[str]:
    """
    Return the most recent commit in common between the git repository
    this function is run within (usually a Manubot manuscript repository)
    and the `main` branch of the `rootstock` remote.

    Returns `None`, after logging a warning, when detection fails: either a
    git command exits with a non-zero status or the git executable cannot be
    run at all.

    WARNING: This function may modify the git repository it's executed within:

    - if the repository has not set the `rootstock` remote, it is set to
      point to the default Rootstock repository of <https://github.com/manubot/rootstock>.
    - fetches the latest commits in the `main` branch of the `rootstock` remote
    """
    rootstock_remote = "https://github.com/manubot/rootstock.git"
    try:
        # add rootstock remote if remote is not already set; a non-zero exit
        # status is ignored on purpose
        args = ["git", "remote", "add", "rootstock", rootstock_remote]
        process = subprocess.run(args, capture_output=True)
        if process.returncode == 0:
            logging.info(
                "get_rootstock_commit added a `rootstock` remote to the git repository."
            )
        # find most recent common ancestor commit
        args = ["git", "fetch", "rootstock", "main"]
        subprocess.check_output(args, stderr=subprocess.PIPE, text=True)
        args = ["git", "merge-base", "HEAD", "rootstock/main"]
        output = subprocess.check_output(args, stderr=subprocess.PIPE, text=True)
    except subprocess.CalledProcessError as error:
        from manubot.util import shlex_join

        logging.warning(
            f"get_rootstock_commit: {shlex_join(error.cmd)!r} returned exit code {error.returncode} "
            f"with the following stdout:\n{error.stdout}\n"
            f"And the following stderr:\n{error.stderr}"
        )
        return None
    except OSError as error:
        # git missing or not executable: report a failed detection as None
        # instead of raising
        logging.warning(f"get_rootstock_commit: unable to execute git: {error}")
        return None
    return output.strip()
get_software_versions
def get_software_versions(
rootstock: bool = True
) -> dict
Return a dictionary of software versions for software components:
- manubot_version: the semantic version number of the manubot python package.
- rootstock_commit: the version of the rootstock repository, as a commit hash, included in the manuscript repository.
Values whose detection fails are set to None.
The rootstock
parameter controls whether to fetch the rootstock commit id.
The rootstock git remote will be added to the local git repository if true.
View Source
def get_software_versions(rootstock: bool = True) -> dict:
    """
    Collect version information for the software components involved:

    - manubot_version: the version number of the manubot python package.
    - rootstock_commit: the rootstock commit hash reported by
      `get_rootstock_commit`, or None when `rootstock` is false.

    Pass `rootstock=False` to skip looking up the rootstock commit. When it is
    true, `get_rootstock_commit` is called, which may add a `rootstock` remote
    to the local git repository and fetch from it.
    """
    from manubot import __version__ as manubot_version

    versions = {"manubot_version": manubot_version, "rootstock_commit": None}
    if rootstock:
        versions["rootstock_commit"] = get_rootstock_commit()
    return versions
get_thumbnail_url
def get_thumbnail_url(
thumbnail=None
)
Starting with a user-specified thumbnail
as either a path, URL, or None,
return an absolute URL pointing to the thumbnail image. If the provided thumbnail
is a URL, return this URL unmodified. If thumbnail
is None, search for thumbnail.png
within the git repository from which this function is executed. If thumbnail
is a local path, the path should be relative to root directory of the git repository
it is located in. If a local path is provided or detected,
it is converted to a GitHub raw URL.
View Source
def get_thumbnail_url(thumbnail=None):
    """
    Resolve `thumbnail` (a path, a URL, or None) to an absolute thumbnail URL.

    - A URL is returned as-is.
    - A falsy value triggers a search for `thumbnail.png` inside the git
      repository this function is executed from.
    - A local path (given or detected) is expected to be relative to the root
      of its git repository and is converted to a GitHub raw URL.

    The result is None when no thumbnail is found or no URL can be built.
    """
    from manubot.util import is_http_url

    if thumbnail:
        if is_http_url(thumbnail):
            logging.debug("provided thumbnail is a URL. Pass it through.")
            return thumbnail
    else:
        thumbnail = _find_thumbnail_path()
        if thumbnail:
            outcome = f"Thumbnail detected at {thumbnail!r}"
        else:
            outcome = "No local thumbnail detected"
        logging.debug(
            "get_thumbnail_url: thumbnail location not explicitly provided. "
            + outcome
        )
    return _thumbnail_path_to_url(thumbnail)
git_repository_root
def git_repository_root(
)
Return the path to repository root directory or None
if indeterminate.
View Source
@functools.lru_cache
def git_repository_root():
    """
    Determine the root directory of the git repository containing the current
    working directory.

    Returns a `pathlib.Path`, or `None` if the root cannot be determined.
    The result is cached, so git is only queried on the first call.
    """
    # The superproject query is tried first; it prints nothing unless the
    # repository is a submodule, in which case the regular toplevel is used.
    for flag in ("--show-superproject-working-tree", "--show-toplevel"):
        try:
            stdout = subprocess.check_output(["git", "rev-parse", flag], text=True)
        except (subprocess.CalledProcessError, OSError):
            continue
        location = stdout.rstrip("\r\n")
        if location:
            return pathlib.Path(location)
    return None