# File indexing completed on 2025-01-18 09:11:34
import asyncio
import base64
import functools
import re
import shutil
import subprocess
import tempfile
import textwrap
import zipfile
from datetime import datetime
from pathlib import Path
from subprocess import run
from typing import IO, List

import aiohttp
import pydantic
import rich.console
import rich.panel
import rich.pretty
import toml
import typer
from fsspec.implementations.zip import ZipFileSystem
from gidgethub.aiohttp import GitHubAPI
from jinja2 import Template
0025
0026
# Typer CLI application; commands below register themselves via @app.command().
app = typer.Typer()
0028
0029
class MetaData(pydantic.BaseModel):
    """Bibliographic metadata for a white paper, parsed from its LaTeX sources."""

    authors: List[str]  # author names (parse_metadata splits the \author text on \and)
    title: str  # title from the LaTeX \title{...} command
    description: str = ""  # abstract text; assigned after construction in pull()
0034
0035
class WhitePaper(pydantic.BaseModel):
    """One tracked white paper, identified by its GitHub repository URL."""

    repository: str  # full GitHub URL, e.g. https://github.com/<owner>/<name>
    slug: str  # short identifier used for output file names (PNG/markdown)
    pdf_url: str | None = None  # release-asset download URL, set by pull()
    metadata: MetaData | None = None  # parsed metadata, set by pull()

    @property
    def repo(self) -> str:
        """Return the "<owner>/<name>" portion of the repository URL."""
        m = re.match(r"^https://github.com/(.*/.*)$", self.repository)
        # Config is expected to only contain GitHub URLs of this shape.
        assert m is not None
        return m.group(1)
0047
0048
class Config(pydantic.BaseModel):
    """Top-level structure of the white_papers.toml configuration file."""

    white_papers: List[WhitePaper]
0051
0052
def parse_metadata(content: str) -> MetaData:
    r"""Extract the author list and title from LaTeX metadata source.

    Parses the ``\author{A \and B \and ...}`` and ``\title{...}`` commands
    in *content*. The ``description`` field is left empty; the caller fills
    it in separately (see ``pull``).

    Raises:
        ValueError: if no ``\author`` or ``\title`` command is found.
    """
    # Explicit errors instead of `assert`: asserts are stripped under
    # `python -O`, which would turn a missing command into an opaque
    # AttributeError on None.
    author_match = re.search(r"\\author{(.*)}", content)
    if author_match is None:
        raise ValueError("no \\author{...} command found in metadata source")
    authors = [a.strip() for a in author_match.group(1).split(r"\and")]

    title_match = re.search(r"\\title{(.*)}", content)
    if title_match is None:
        raise ValueError("no \\title{...} command found in metadata source")
    title = title_match.group(1).strip()

    return MetaData(authors=authors, title=title)
0064
0065
0066 def which(cmd: str) -> Path | None:
0067 try:
0068 exe = (
0069 run(["which", cmd], check=True, capture_output=True).stdout.decode().strip()
0070 )
0071 return Path(exe)
0072 except subprocess.CalledProcessError:
0073 return None
0074
0075
def coro(fn):
    """Adapt an async function to a synchronous call interface.

    The returned callable drives *fn* to completion with asyncio.run(),
    which lets async functions serve as Typer command callbacks.
    """
    def run_sync(*args, **kwargs):
        return asyncio.run(fn(*args, **kwargs))

    # Preserve the wrapped function's name/docstring for introspection.
    return functools.wraps(fn)(run_sync)
0082
0083
def make_titlepage_image(convert: Path, pdf: Path, output: Path):
    """Render the first page of *pdf* to an image file at *output*.

    Invokes ImageMagick's ``convert`` (path given by *convert*) at 300 dpi,
    flattening transparency onto a white background and quantizing to
    2 bits per channel to keep the PNG small. Raises CalledProcessError
    if the conversion fails.
    """
    cmd = [
        convert,
        "-density", "300",
        f"{pdf}[0]",  # [0] selects only the first page of the PDF
        "-background", "white",
        "-alpha", "remove",
        "-alpha", "off",
        "-depth", "2",
        str(output),
    ]
    # Capture stdout+stderr together so convert's chatter doesn't leak
    # into the console UI.
    run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
0105
0106
async def get_file(gh: GitHubAPI, repo: str, file: str, ref: str | None = None) -> str:
    """Fetch *file* from *repo* via the GitHub contents API and return its text.

    The contents endpoint returns the file base64-encoded; this decodes it
    to a str. If *ref* is given, the file is read at that tag/branch/commit.
    """
    suffix = "" if ref is None else f"?ref={ref}"
    response = await gh.getitem(f"/repos/{repo}/contents/{file}{suffix}")
    return base64.b64decode(response["content"]).decode()
0113
0114
class Release(pydantic.BaseModel):
    """Subset of the GitHub REST "release" object used by this script."""

    class Asset(pydantic.BaseModel):
        """Downloadable file attached to a release."""

        name: str
        browser_download_url: str

    id: int
    created_at: datetime
    published_at: datetime | None  # may be absent; such releases are skipped by get_latest_release
    tag_name: str | None
    assets: List[Asset]
0125
0126
class Artifact(pydantic.BaseModel):
    """Subset of the GitHub REST "workflow artifact" object used by this script."""

    class WorkflowRun(pydantic.BaseModel):
        """The CI workflow run that produced the artifact."""

        head_branch: str
        head_sha: str

    updated_at: datetime
    workflow_run: WorkflowRun
    archive_download_url: str  # zip archive containing the artifact's files
0135
0136
async def get_latest_release(gh: GitHubAPI, repo: str) -> Release | None:
    """Return the most recently published release of *repo*, or None.

    Releases without a published_at timestamp are ignored. Ties keep the
    first release seen.
    """
    best: Release | None = None
    async for raw in gh.getiter(f"/repos/{repo}/releases"):
        candidate = Release(**raw)
        if candidate.published_at is None:
            continue
        if best is None or candidate.published_at > best.published_at:
            best = candidate
    return best
0153
0154
async def get_latest_artifact(
    gh: GitHubAPI, repo: str, branch: str = "main"
) -> Artifact | None:
    """Return the most recently updated workflow artifact built from *branch*.

    Artifacts produced from other branches are ignored. Returns None when
    no matching artifact exists. Ties keep the first artifact seen.
    """
    newest: Artifact | None = None
    async for raw in gh.getiter(
        f"/repos/{repo}/actions/artifacts", iterable_key="artifacts"
    ):
        candidate = Artifact(**raw)
        if candidate.workflow_run.head_branch != branch:
            continue
        if newest is None or candidate.updated_at > newest.updated_at:
            newest = candidate
    return newest
0173
0174
async def download_file(
    session: aiohttp.ClientSession, url: str, target: IO[bytes], *args, **kwargs
):
    """Stream the body of *url* into *target* and rewind it to the start.

    Extra positional/keyword arguments are forwarded to ``session.get``
    (used by callers to pass e.g. ``headers=``).
    """
    # Use the response as a context manager so the connection is released
    # back to the pool even if an error occurs mid-download; the original
    # code never closed the response.
    async with session.get(url, *args, **kwargs) as r:
        async for chunk, _ in r.content.iter_chunks():
            target.write(chunk)
    # Rewind so callers can immediately read the downloaded bytes.
    target.seek(0)
0183
0184
def extract_pdf_from_artifact(fh: IO[bytes], target: IO[bytes]):
    """Copy the single PDF inside a workflow-artifact zip *fh* into *target*.

    The artifact archive is expected to contain exactly one file, ending in
    ``.pdf``. Uses the stdlib ``zipfile`` module (rather than fsspec's
    ZipFileSystem) and context managers, so the archive member handle is
    closed deterministically — the original left it open.

    Raises:
        typer.Exit: if the archive does not contain exactly one PDF file.
    """
    # ZipFile does not close a file object it was handed, so *fh* stays
    # usable by the caller.
    with zipfile.ZipFile(fh) as archive:
        files = archive.namelist()
        if len(files) != 1:
            print("Unexpected number of files in artifact", files)
            raise typer.Exit(1)
        if not files[0].endswith(".pdf"):
            print("Unexpected file in artifact", files)
            raise typer.Exit(1)

        with archive.open(files[0]) as member:
            shutil.copyfileobj(member, target)
0196
0197
@app.command()
@coro
async def pull(
    config_file: Path = Path(__file__).parent / "white_papers.toml",
    github_token: str = typer.Option(..., envvar="GITHUB_TOKEN"),
):
    """Refresh title-page images, PDF links and metadata for all white papers.

    For each entry in the config: prefer the PDF asset of the latest GitHub
    release; fall back to the newest CI artifact on the default branch.
    Renders the PDF's first page to a PNG with ImageMagick, fetches
    metadata.tex/abstract.tex at the matching ref, and finally rewrites the
    TOML config with the collected data.
    """
    console = rich.console.Console()
    # Fail fast if ImageMagick's `convert` is not available — every paper
    # needs it for the title-page image.
    convert = which("convert")
    if convert is None:
        print("convert (imagemagick) not found, please install")
        raise typer.Exit(1)

    async with aiohttp.ClientSession() as session:
        gh = GitHubAPI(session, "requester", oauth_token=github_token)

        with console.status("[bold green]Loading config...") as status:
            config = Config(**toml.load(config_file))

            for whp in config.white_papers:
                status.update(f"Loading {whp.repository}...")

                repo = whp.repo
                slug = whp.slug

                image_path = Path(__file__).parent / "white_papers" / "figures"
                assert image_path.is_dir(), image_path

                title_page_file = image_path / f"{slug}.png"

                latest_release = await get_latest_release(gh, repo)

                # Ref at which metadata.tex/abstract.tex will be fetched:
                # the release tag when a release PDF exists, else the
                # artifact's head commit, else the default branch (None).
                metadata_ref = None

                if latest_release is not None:
                    for asset in latest_release.assets:
                        if asset.name.endswith(".pdf"):
                            whp.pdf_url = asset.browser_download_url
                            metadata_ref = latest_release.tag_name
                            break

                if whp.pdf_url is not None:
                    # Release path: download the published PDF asset.
                    # (`\[` escapes the bracket for rich's status markup.)
                    with tempfile.NamedTemporaryFile(suffix=".pdf") as pdf_fh:
                        status.update(f"\[{repo}] Downloading PDF...")
                        await download_file(
                            session,
                            whp.pdf_url,
                            pdf_fh,
                            headers={"Authorization": f"Token {github_token}"},
                        )
                        status.update(f"\[{repo}] Converting PDF to PNG...")
                        make_titlepage_image(
                            convert, Path(pdf_fh.name), title_page_file
                        )

                else:
                    # Fallback path: newest CI artifact (a zip wrapping the PDF).
                    latest_artifact = await get_latest_artifact(gh, repo)

                    if latest_artifact is None:
                        print("No artifacts found for", whp.repository)
                        raise typer.Exit(1)

                    metadata_ref = latest_artifact.workflow_run.head_sha

                    status.update(f"\[{repo}] Downloading artifact...")

                    with tempfile.TemporaryFile() as fh, tempfile.NamedTemporaryFile(
                        suffix=".pdf"
                    ) as pdf_fh:
                        await download_file(
                            session,
                            latest_artifact.archive_download_url,
                            fh,
                            headers={"Authorization": f"Token {github_token}"},
                        )

                        extract_pdf_from_artifact(fh, pdf_fh)

                        status.update(f"\[{repo}] Converting PDF to PNG...")
                        make_titlepage_image(
                            convert, Path(pdf_fh.name), title_page_file
                        )

                status.update(f"\[{repo}] Getting metadata for ref {metadata_ref}...")

                metadata = await get_file(gh, repo, "metadata.tex", ref=metadata_ref)
                abstract = await get_file(gh, repo, "abstract.tex", ref=metadata_ref)

                # Strip LaTeX comment lines (starting with %) from the abstract.
                abstract = textwrap.dedent(abstract).strip()
                abstract = "\n".join(
                    [
                        line
                        for line in abstract.split("\n")
                        if not line.strip().startswith("%")
                    ]
                ).strip()
                metadata = parse_metadata(metadata)
                metadata.description = abstract

                console.print(
                    rich.panel.Panel(rich.pretty.Pretty(metadata), title=whp.repository)
                )

                whp.metadata = metadata

            status.update("Updating config...")

            # Persist everything gathered above back into the TOML config.
            with open(config_file, "w") as f:
                toml.dump(config.model_dump(), f)
0306
0307
@app.command()
def render(config_file: Path = Path(__file__).parent / "white_papers.toml"):
    """Render the white-paper index and one markdown page per paper.

    Reads the TOML config (which pull() must have populated with metadata)
    and writes Jinja-rendered markdown files into the white_papers directory.
    """
    config = Config(**toml.load(config_file))

    base = Path(__file__).parent
    docs_path = base / "white_papers"
    assert docs_path.is_dir(), docs_path
    index_file = docs_path / "index.md"

    # Load both templates up front.
    page_tpl = Template((base / "white_paper_template.md.j2").read_text())
    index_tpl = Template((base / "white_paper_index_template.md.j2").read_text())

    index_file.write_text(index_tpl.render(config=config))
    print("Index written to", index_file)

    for whp in config.white_papers:
        assert whp.metadata is not None, "White paper meta data is missing"
        page_file = docs_path / f"{whp.slug}.md"
        page_file.write_text(page_tpl.render(whp=whp, image_path="figures"))
        print("-", whp.metadata.title, "->", page_file)
0329
0330
# Script entry point: dispatch to the Typer CLI.
if __name__ == "__main__":
    app()