Skip to content

Commit

Permalink
docs: sorted by downloads [wip] (#28869)
Browse files Browse the repository at this point in the history
  • Loading branch information
efriis authored Dec 23, 2024
1 parent 6352edf commit 3726a94
Show file tree
Hide file tree
Showing 3 changed files with 456 additions and 251 deletions.
71 changes: 71 additions & 0 deletions docs/scripts/packages_yml_get_downloads.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
from datetime import datetime, timedelta, timezone
from pathlib import Path

import requests
from ruamel.yaml import YAML
from ruamel.yaml.comments import CommentedMap

# Round-trip YAML handler: ruamel preserves comments and key order when
# dumping, which plain PyYAML would discard.
yaml = YAML()

# Path to libs/packages.yml relative to this script (docs/scripts/ -> repo root).
PACKAGE_YML = Path(__file__).parents[2] / "libs" / "packages.yml"


def _get_downloads(p: dict) -> int:
    """Return the last-month PyPI download count for package ``p``.

    Args:
        p: A package entry from packages.yml; only ``p["name"]`` is read.

    Returns:
        Download count for the trailing month, per pypistats.org.

    Raises:
        requests.HTTPError: If pypistats responds with a non-2xx status.
    """
    url = f"https://pypistats.org/api/packages/{p['name']}/recent?period=month"
    # Always pass a timeout: requests has none by default, and one hung
    # connection would stall the whole refresh script indefinitely.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    return r.json()["data"]["last_month"]


# Timestamps used to decide whether a package's cached download count is
# still fresh (i.e. updated within the last 24 hours).
current_datetime = datetime.now(timezone.utc)
yesterday = current_datetime - timedelta(days=1)

# Load the full registry; ruamel round-trips comments and ordering.
with open(PACKAGE_YML) as f:
    data = yaml.load(f)


def _reorder_keys(p):
    """Return ``p`` as a CommentedMap with its keys in canonical order.

    Raises:
        ValueError: If ``p`` contains any key outside the known set.
    """
    canonical = [
        "name",
        "name_title",
        "path",
        "repo",
        "type",
        "provider_page",
        "js",
        "downloads",
        "downloads_updated_at",
    ]
    unexpected = set(p.keys()) - set(canonical)
    if unexpected:
        raise ValueError(f"Unexpected keys: {unexpected}")
    return CommentedMap((key, p[key]) for key in canonical if key in p)


# Normalize key order on every entry before refreshing counts.
data["packages"] = [_reorder_keys(p) for p in data["packages"]]

# Refresh download counts, skipping entries updated within the last day.
seen = set()
for p in data["packages"]:
    # packages.yml must not list the same package twice.
    if p["name"] in seen:
        raise ValueError(f"Duplicate package: {p['name']}")
    seen.add(p["name"])
    downloads_updated_at_str = p.get("downloads_updated_at")
    downloads_updated_at = (
        datetime.fromisoformat(downloads_updated_at_str)
        if downloads_updated_at_str
        else None
    )

    # Cached count is fresh (< 24h old): keep it and move on.
    # NOTE(review): assumes stored timestamps are timezone-aware (they are
    # written with current_datetime.isoformat() below) — a hand-edited naive
    # timestamp would raise TypeError here.
    if downloads_updated_at is not None and downloads_updated_at > yesterday:
        print(f"done: {p['name']}: {p['downloads']}")
        continue

    p["downloads"] = _get_downloads(p)
    p["downloads_updated_at"] = current_datetime.isoformat()
    # Checkpoint the whole file after every package so progress survives an
    # interrupted run (each fetch is a network call that can fail mid-list).
    with open(PACKAGE_YML, "w") as f:
        yaml.dump(data, f)
    print(f"{p['name']}: {p['downloads']}")


# Final write: redundant when the loop checkpointed on the last entry, but
# guarantees the normalized key order is persisted even when every package
# was skipped as fresh.
with open(PACKAGE_YML, "w") as f:
    yaml.dump(data, f)
160 changes: 78 additions & 82 deletions docs/scripts/partner_pkg_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,110 +2,106 @@
import sys
from pathlib import Path

import requests
import yaml

#################
# CONFIGURATION #
#################

# Packages to ignore / exclude from the table.
# NOTE: the identifier is misspelled ("PACKGAGES") consistently in this file;
# keep the spelling until every use is renamed in one change.
IGNORE_PACKGAGES = {
    # top-level packages
    "langchain-core",
    "langchain-text-splitters",
    "langchain",
    "langchain-community",
    "langchain-experimental",
    "langchain-cli",
    "langchain-tests",
    # integration packages that don't have a provider index
    # do NOT add to these. These were merged before having a
    # provider index was required
    # can remove these once they have a provider index
    "langchain-yt-dlp",
}

#####################
# END CONFIGURATION #
#####################

# Repo-relative paths: the docs/ directory and the canonical package registry.
DOCS_DIR = Path(__file__).parents[1]
PACKAGE_YML = Path(__file__).parents[2] / "libs" / "packages.yml"
# NOTE(review): this reassignment clobbers the larger IGNORE_PACKGAGES set
# defined above — it looks like leftover pre-refactor (removed-diff) content;
# confirm and delete.
IGNORE_PACKGAGES = {"langchain-experimental"}

# for now, only include packages that are in the langchain-ai org
# because we don't have a policy for inclusion in this table yet,
# and including all packages will make the list too long
with open(PACKAGE_YML) as f:
    data = yaml.safe_load(f)
# NOTE(review): the expression below is unterminated (no closing paren) and
# appears to be leftover pre-refactor (removed-diff) content — the enriched
# pipeline below no longer uses EXTERNAL_PACKAGES; confirm and delete.
EXTERNAL_PACKAGES = set(
    p["name"][10:]
    for p in data["packages"]
    if p["repo"].startswith("langchain-ai/")
    and p["repo"] != "langchain-ai/langchain"
    and p["name"] not in IGNORE_PACKGAGES

def _get_type(package: dict) -> str:
    """Classify ``package`` for table inclusion.

    Returns:
        "ignore" for excluded packages, "B" for packages in the
        langchain-ai/langchain monorepo, "C" for other langchain-ai org
        repos, and "D" for everything else.
    """
    repo = package["repo"]
    if package["name"] in IGNORE_PACKGAGES:
        return "ignore"
    if repo == "langchain-ai/langchain":
        return "B"
    return "C" if repo.startswith("langchain-ai/") else "D"


def _enrich_package(p: dict) -> dict | None:
    """Add derived/display fields to a packages.yml entry, in place.

    Adds ``name_short``, ``name_title``, ``type``, ``js_exists`` and
    ``provider_page``.

    (fix) Removed pre-refactor diff fragments that were spliced into this
    function body (an unterminated ``IN_REPO_PACKAGES`` comprehension and the
    old ``JS_PACKAGES`` set literal), which made the definition unparseable.

    Args:
        p: One package entry loaded from packages.yml.

    Returns:
        The enriched dict, or None when the package is ignored.

    Raises:
        ValueError: If no provider page exists on disk and none is configured.
    """
    # Strip the "langchain-" prefix for display and API-reference URLs.
    p["name_short"] = (
        p["name"][10:] if p["name"].startswith("langchain-") else p["name"]
    )
    # Derive a human-readable title unless packages.yml sets one explicitly.
    p["name_title"] = p.get("name_title") or p["name_short"].title().replace(
        "-", " "
    ).replace("db", "DB").replace("Db", "DB").replace("ai", "AI").replace("Ai", "AI")
    p["type"] = _get_type(p)

    if p["type"] == "ignore":
        return None

    p["js_exists"] = bool(p.get("js"))
    # Prefer an explicitly configured provider_page; otherwise use the default
    # path only when a docs/integrations/providers/<name_short>.* file exists.
    custom_provider_page = p.get("provider_page")
    default_provider_page = f"/docs/integrations/providers/{p['name_short']}/"
    default_provider_page_exists = bool(
        glob.glob(str(DOCS_DIR / f"docs/integrations/providers/{p['name_short']}.*"))
    )
    p["provider_page"] = custom_provider_page or (
        default_provider_page if default_provider_page_exists else None
    )
    if p["provider_page"] is None:
        msg = (
            f"Provider page not found for {p['name_short']}. "
            f"Please add one at docs/integrations/providers/{p['name_short']}.{{mdx,ipynb}}"
        )
        raise ValueError(msg)

    return p

# NOTE(review): everything from here down through PROVIDER_PAGES appears to be
# leftover pre-refactor (removed-diff) content — it references
# IN_REPO_PACKAGES / EXTERNAL_PACKAGES, which the new pipeline no longer
# defines, and is superseded by _enrich_package; confirm and delete.
ALL_PACKAGES = IN_REPO_PACKAGES.union(EXTERNAL_PACKAGES)

CUSTOM_NAME = {
    "google-genai": "Google Generative AI",
    "aws": "AWS",
    "ibm": "IBM",
}
CUSTOM_PROVIDER_PAGES = {
    "azure-dynamic-sessions": "/docs/integrations/providers/microsoft/",
    "prompty": "/docs/integrations/providers/microsoft/",
    "sqlserver": "/docs/integrations/providers/microsoft/",
    "google-community": "/docs/integrations/providers/google/",
    "google-genai": "/docs/integrations/providers/google/",
    "google-vertexai": "/docs/integrations/providers/google/",
    "nvidia-ai-endpoints": "/docs/integrations/providers/nvidia/",
    "exa": "/docs/integrations/providers/exa_search/",
    "mongodb": "/docs/integrations/providers/mongodb_atlas/",
    "sema4": "/docs/integrations/providers/robocorp/",
    "postgres": "/docs/integrations/providers/pgvector/",
}
PROVIDER_PAGES = {
    name: f"/docs/integrations/providers/{name}/"
    for name in ALL_PACKAGES
    if glob.glob(str(DOCS_DIR / f"docs/integrations/providers/{name}.*"))
}
PROVIDER_PAGES = {
    **PROVIDER_PAGES,
    **CUSTOM_PROVIDER_PAGES,
}
# Load the package registry and build the enriched, display-ready list.
with open(PACKAGE_YML) as f:
    data = yaml.safe_load(f)

packages_n = [_enrich_package(p) for p in data["packages"]]
# Drop ignored packages (enrichment returns None for those).
packages = [p for p in packages_n if p is not None]

# sort by downloads
packages_sorted = sorted(packages, key=lambda p: p["downloads"], reverse=True)


def package_row(p: dict) -> str:
    """Render one markdown table row for an enriched package entry.

    (fix) Removed the leftover pre-refactor ``package_row(name)`` definition
    and its stray ``return`` line, which sat first inside the new function
    body, referenced the undefined name ``name``, and would raise NameError
    on every call.

    Args:
        p: An entry produced by ``_enrich_package`` (requires ``name``,
           ``name_short``, ``name_title``, ``js_exists``, ``provider_page``).

    Returns:
        A single ``|``-delimited markdown table row.
    """
    js = "✅" if p["js_exists"] else "❌"
    link = p["provider_page"]
    title = p["name_title"]
    # Link the provider title only when a provider page exists.
    provider = f"[{title}]({link})" if link else title
    return f"| {provider} | [{p['name']}](https://python.langchain.com/api_reference/{p['name_short'].replace('-', '_')}/) | ![PyPI - Downloads](https://img.shields.io/pypi/dm/{p['name']}?style=flat-square&label=%20&color=blue) | ![PyPI - Version](https://img.shields.io/pypi/v/{p['name']}?style=flat-square&label=%20&color=orange) | {js} |"


def table() -> str:
    """Render the full provider table (header + one row per package),
    with rows sorted by monthly downloads, descending.

    (fix) Removed the leftover pre-refactor ``return`` line that referenced
    the no-longer-defined ``ALL_PACKAGES`` and, sitting first, shadowed the
    real return statement.
    """
    header = """| Provider | Package | Downloads | Latest | [JS](https://js.langchain.com/docs/integrations/providers/) |
| :--- | :---: | :---: | :---: | :---: |
"""
    return header + "\n".join(package_row(p) for p in packages_sorted)


def doc() -> str:
Expand Down
Loading

0 comments on commit 3726a94

Please sign in to comment.