docs: add docs for lib

zhudotexe committed Feb 13, 2024
1 parent f91c053 commit 4812790
Showing 10 changed files with 71 additions and 115 deletions.
6 changes: 3 additions & 3 deletions README.md
@@ -1,3 +1,5 @@
# FanOutQA

<p align="center">
<a href="https://fanoutqa.readthedocs.io/en/latest/?badge=latest">
<img alt="Documentation Status" src="https://readthedocs.org/projects/fanoutqa/badge/?version=latest">
@@ -7,9 +9,7 @@
</a>
</p>

# FanOutQA

Read the paper! | [Download the dataset!](/fanoutqa/data)
Read the paper! | [Download the dataset!](https://github.com/zhudotexe/fanoutqa/tree/main/fanoutqa/data)

FanOutQA is a high quality, multi-hop, multi-document benchmark for large language models using English Wikipedia as its
knowledge base. Compared to other question-answering benchmarks, FanOutQA requires reasoning over a greater number of
29 changes: 29 additions & 0 deletions docs/api_reference.rst
@@ -0,0 +1,29 @@
API Reference
=============

Main Entrypoints
----------------
.. autofunction:: fanoutqa.load_dev

.. autofunction:: fanoutqa.load_test

.. autofunction:: fanoutqa.eval.evaluate

Wikipedia Retrieval
-------------------
.. autofunction:: fanoutqa.wiki_search

.. autofunction:: fanoutqa.wiki_content

Models
------
.. automodule:: fanoutqa.models
:members:

.. automodule:: fanoutqa.eval.models
:members:

Baseline Retriever
------------------
.. automodule:: fanoutqa.retrieval
:members:
3 changes: 2 additions & 1 deletion docs/conf.py
@@ -30,6 +30,7 @@
"sphinx_copybutton", # https://sphinx-copybutton.readthedocs.io/en/latest/
"sphinxemoji.sphinxemoji", # https://sphinxemojicodes.readthedocs.io/en/stable/
"sphinx_sitemap", # https://sphinx-sitemap.readthedocs.io/en/latest/getting-started.html
"myst_parser", # https://myst-parser.readthedocs.io/en/stable/intro.html
]

templates_path = ["_templates"]
@@ -45,7 +46,7 @@
html_extra_path = ["_extra"]
# html_logo = "_static/[email protected]"
# html_favicon = "_extra/favicon.ico"
# html_baseurl = "https://kani.readthedocs.io/en/latest/"
html_baseurl = "https://fanoutqa.com/en/latest/"

nitpicky = True
nitpick_ignore_regex = [
90 changes: 3 additions & 87 deletions docs/index.rst
@@ -1,92 +1,8 @@
kani (カニ)
===========

kani (カニ) is a lightweight and highly hackable framework for chat-based language models with tool usage/function
calling.

Compared to other LM frameworks, kani is less opinionated and offers more fine-grained customizability
over the parts of the control flow that matter, making it the perfect choice for NLP researchers, hobbyists, and
developers alike.

kani comes with support for OpenAI models and LLaMA v2 out of the box, with a model-agnostic framework to add support
for many more.

Features
--------

- **Lightweight and high-level** - kani implements common boilerplate to interface with language models without forcing
you to use opinionated prompt frameworks or complex library-specific tooling.
- **Model agnostic** - kani provides a simple interface to implement: token counting and completion generation.
Implement these two, and kani can run with any language model.
- **Automatic chat memory management** - Allow chat sessions to flow without worrying about managing the number of
tokens in the history - kani takes care of it.
- **Function calling with model feedback and retry** - Give models access to functions in just one line of code.
kani elegantly provides feedback about hallucinated parameters and errors and allows the model to retry calls.
- **You control the prompts** - There are no hidden prompt hacks. We will never decide for you how to format your own
data, unlike other popular language model libraries.
- **Fast to iterate and intuitive to learn** - With kani, you only write Python - we handle the rest.
- **Asynchronous design from the start** - kani can scale to run multiple chat sessions in parallel easily, without
having to manage multiple processes or programs.

Quickstart
----------
kani requires Python 3.10 or above.

First, install the library. In this quickstart, we'll use the OpenAI engine, though kani is model-agnostic.

.. code-block:: console
$ pip install "kani[openai]"
Then, let's use kani to create a simple chatbot using ChatGPT as a backend.

.. code-block:: python
# import the library
from kani import Kani, chat_in_terminal
from kani.engines.openai import OpenAIEngine
# Replace this with your OpenAI API key: https://platform.openai.com/account/api-keys
api_key = "sk-..."
# kani uses an Engine to interact with the language model. You can specify other model
# parameters here, like temperature=0.7.
engine = OpenAIEngine(api_key, model="gpt-3.5-turbo")
# The kani manages the chat state, prompting, and function calling. Here, we only give
# it the engine to call ChatGPT, but you can specify other parameters like system_prompt="You are..." here.
ai = Kani(engine)
# kani comes with a utility to interact with a kani through your terminal! Check out
# the docs for how to use kani programmatically.
chat_in_terminal(ai)
kani makes the time to set up a working chat model short, while offering the programmer deep customizability over
every prompt, function call, and even the underlying language model.

To learn more about how to customize kani with your own prompt wrappers, function calling, and more, read on!

Hands-on examples are available in the `kani repository <https://github.com/zhudotexe/kani/tree/main/examples>`_.
.. include:: ../README.md
:parser: myst_parser.sphinx_

.. toctree::
:maxdepth: 2
:caption: Docs

install
kani
function_calling
customization
engines
advanced
api_reference
engine_reference
genindex

.. toctree::
:maxdepth: 2
:caption: Community

community/contributing
community/extensions
community/showcase
Discord <https://discord.gg/eTepTNDxYT>
genindex
1 change: 1 addition & 0 deletions fanoutqa/eval/models.py
@@ -34,5 +34,6 @@ class EvaluationScore:


class Answer(TypedDict):
"""A dictionary of the form ``{"id": "...", "answer": "..."}``."""
id: str
answer: str
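The new docstring pins down the expected answer shape. As a stdlib-only sketch — the class below mirrors the `Answer` TypedDict from the diff rather than importing it from the library:

```python
from typing import TypedDict


class Answer(TypedDict):
    """A dictionary of the form {"id": "...", "answer": "..."}."""

    id: str
    answer: str


# TypedDict is purely a static-typing aid: at runtime these are plain dicts,
# so generated answers can be built as ordinary dict literals.
# (The id/answer values here are hypothetical placeholders.)
pred: Answer = {"id": "dev-q1", "answer": "Paris"}
```

Type checkers such as mypy will flag missing or misspelled keys, but nothing is enforced at runtime.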
13 changes: 11 additions & 2 deletions fanoutqa/eval/scorer.py
@@ -12,7 +12,7 @@
from fanoutqa.eval.string import answer_in_text
from fanoutqa.eval.utils import str_answer
from fanoutqa.models import DevQuestion
from fanoutqa.utils import batched, copy_doc
from fanoutqa.utils import batched

ROUGE_TYPES = ("rouge1", "rouge2", "rougeL")

@@ -167,7 +167,16 @@ async def score_gpt(self):
return avg_acc


@copy_doc(Scorer.__init__)
def evaluate(questions: list[DevQuestion], answers: list[Answer], **kwargs) -> EvaluationScore:
"""
Evaluate all FOQA metrics across the given questions and generated answers.
:param questions: The questions and reference answers, as loaded by the dataset.
:param answers: The generated answers to score. These should be dictionaries like ``{"id": "...", "answer": "..."}``
:param only_score_answered: Whether to only score questions that have an answer (True), or consider unanswered
questions to have 0 score (False, default). This is useful for evaluating only a subset of the dataset.
:param llm_cache_key: If this is provided, cache the LLM-as-judge generations with this key. We recommend
setting this to a human-readable key for each system under test.
"""
scorer = Scorer(questions, answers, **kwargs)
return asyncio.run(scorer.score())
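With `copy_doc` gone, `evaluate` carries its own docstring and simply forwards `**kwargs` to `Scorer`, then drives the async scorer with `asyncio.run`. A minimal, self-contained sketch of that sync-over-async wrapper pattern — the toy `Scorer` below stands in for the real FOQA metrics and is not the library's implementation:

```python
import asyncio


class Scorer:
    """Toy stand-in for fanoutqa.eval.scorer.Scorer (assumed interface)."""

    def __init__(self, questions, answers, only_score_answered=False):
        self.questions = questions
        self.answers = answers
        self.only_score_answered = only_score_answered

    async def score(self):
        # The real scorer computes string-match/ROUGE/LLM-judge metrics; here
        # we just report the fraction of questions that received any answer.
        answered = {a["id"] for a in self.answers}
        if self.only_score_answered:
            return 1.0 if answered else 0.0
        return sum(q in answered for q in self.questions) / len(self.questions)


def evaluate(questions, answers, **kwargs):
    """Evaluate all metrics; extra kwargs are forwarded to Scorer unchanged."""
    scorer = Scorer(questions, answers, **kwargs)
    return asyncio.run(scorer.score())


score = evaluate(["q1", "q2"], [{"id": "q1", "answer": "42"}])
```

Forwarding `**kwargs` keeps the public entrypoint's signature stable while new scorer options (like `llm_cache_key`) are added.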
29 changes: 18 additions & 11 deletions fanoutqa/models.py
@@ -10,16 +10,16 @@ class Evidence:
"""A reference to a Wikipedia article at a given point in time."""

pageid: int
"""Wikipedia page ID"""
"""Wikipedia page ID."""

revid: int
"""Wikipedia revision ID of page as of dataset epoch"""
"""Wikipedia revision ID of page as of dataset epoch. Often referred to as ``oldid`` in Wikipedia API docs."""

title: str
"""Title of page"""
"""Title of page."""

url: str
"""Link to page"""
"""Link to page."""

@classmethod
def from_dict(cls, d):
@@ -31,16 +31,18 @@ class DevSubquestion:
"""A human-written decomposition of a top-level question."""

id: str
"""The ID of the question."""
question: str
"""The question for the system to answer."""
decomposition: list["DevSubquestion"]
"""A human-written decomposition of the question."""
answer: AnswerType
"""the answer to this subquestion"""

"""The human-written reference answer to this subquestion."""
depends_on: list[str]
"""the IDs of subquestions that this subquestion requires answering first"""

"""The IDs of subquestions that this subquestion requires answering first."""
evidence: Optional[Evidence]
"""if this is None, the question will have a decomposition"""
"""The Wikipedia page used by the human annotator to answer this question.
If this is None, the question will have a decomposition."""

@classmethod
def from_dict(cls, d):
@@ -61,11 +63,13 @@ class DevQuestion:
"""A top-level question in the FOQA dataset and its decomposition."""

id: str
"""The ID of the question."""
question: str
"""the top-level question to answer"""
"""The top-level question for the system to answer."""
decomposition: list[DevSubquestion]
"""human-written decomposition of the question"""
"""A human-written decomposition of the question."""
answer: AnswerType
"""A human-written reference answer to the question."""
categories: list[str]

@classmethod
@@ -97,8 +101,11 @@ class TestQuestion:
"""A top-level question in the FOQA dataset, without its decomposition or answer."""

id: str
"""The ID of the question."""
question: str
"""The top-level question for the system to answer."""
necessary_evidence: list[Evidence]
"""A list of all the evidence used by human annotators to answer the question."""
categories: list[str]

@classmethod
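Each model in the diff exposes a `from_dict` classmethod for deserializing dataset JSON. A hedged, stdlib-only sketch of how `Evidence.from_dict` plausibly works — the field names come from the diff, but the implementation and the sample values are assumptions:

```python
from dataclasses import dataclass


@dataclass
class Evidence:
    """A reference to a Wikipedia article at a given point in time."""

    pageid: int  # Wikipedia page ID
    revid: int   # revision ID as of the dataset epoch ("oldid" in Wikipedia API docs)
    title: str   # title of the page
    url: str     # link to the page

    @classmethod
    def from_dict(cls, d: dict) -> "Evidence":
        # Keep only the declared fields so extra JSON keys are ignored.
        fields = ("pageid", "revid", "title", "url")
        return cls(**{k: d[k] for k in fields})


# Hypothetical record for illustration; real IDs come from the dataset files.
ev = Evidence.from_dict(
    {
        "pageid": 12345,
        "revid": 67890,
        "title": "Example Article",
        "url": "https://en.wikipedia.org/wiki/Example_Article",
        "extra_key": "silently ignored",
    }
)
```

Filtering to declared fields makes deserialization tolerant of forward-compatible additions to the dataset schema.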
5 changes: 3 additions & 2 deletions fanoutqa/retrieval.py
@@ -26,7 +26,8 @@ class RetrievalResult:


class Corpus:
"""A corpus of wiki docs. Indexes the docs on creation, normalizing the text beforehand with lemmatization.
"""
A corpus of wiki docs. Indexes the docs on creation, normalizing the text beforehand with lemmatization.
Splits the documents into chunks no longer than a given length, preferring splitting on paragraph and sentence
boundaries. Documents will be converted to Markdown.
@@ -42,7 +43,7 @@ class Corpus:
corpus = fanoutqa.retrieval.Corpus(q.necessary_evidence)
for fragment in corpus.best(q.question):
# use your own structured prompt format here
prompt += f"# {fragment.title}\n{fragment.content}\n\n"
prompt += f"# {fragment.title}\\n{fragment.content}\\n\\n"
"""

def __init__(self, documents: list[Evidence], doc_len: int = 2048):
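The `Corpus` docstring says documents are split into chunks no longer than `doc_len`, preferring paragraph and sentence boundaries. A simplified, stdlib-only sketch of that splitting strategy — an assumption about the approach, not the library's actual code, and it omits the sentence-boundary fallback:

```python
def chunk_text(text: str, doc_len: int = 2048) -> list[str]:
    """Greedily pack paragraphs into chunks of at most doc_len characters."""
    chunks: list[str] = []
    current = ""
    for para in text.split("\n\n"):
        if not para.strip():
            continue
        candidate = f"{current}\n\n{para}" if current else para
        if len(candidate) <= doc_len:
            current = candidate
        else:
            if current:
                chunks.append(current)
            # A single paragraph longer than doc_len is emitted as-is here;
            # the real implementation would fall back to sentence boundaries.
            current = para
    if current:
        chunks.append(current)
    return chunks


parts = chunk_text("aaa\n\nbbb\n\nccc", doc_len=8)
```

Splitting on paragraph boundaries first keeps each retrieved fragment coherent, which matters when fragments are pasted directly into a prompt as the docstring's example does.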
9 changes: 0 additions & 9 deletions fanoutqa/utils.py
@@ -52,15 +52,6 @@ def batched(iterable, n):
yield batch


def copy_doc(src):
"""A simple wrapper to copy the docstring of a source function to the decorated function."""

def wrapper(f):
f.__doc__ = src.__doc__

return wrapper


# markdown
# We make some minor adjustments to markdownify's default style to make it look a little bit nicer
def discard(*_):
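`utils.py` keeps its `batched` helper (the diff shows only its trailing `yield batch`). A stdlib sketch matching the semantics of `itertools.batched` from Python 3.12, which a helper like this commonly backports — this is an assumed implementation, not the library's:

```python
from itertools import islice


def batched(iterable, n):
    """Yield successive lists of up to n items from iterable."""
    if n < 1:
        raise ValueError("n must be at least one")
    it = iter(iterable)
    # islice consumes up to n items per pass; the loop ends on an empty batch.
    while batch := list(islice(it, n)):
        yield batch


groups = list(batched(range(7), 3))
```

A helper like this is typically used to rate-limit batched LLM-judge requests during scoring.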
1 change: 1 addition & 0 deletions requirements.txt
@@ -17,3 +17,4 @@ sphinx-inline-tabs~=2023.4.21
sphinx-sitemap~=2.5.1
sphinxext-opengraph~=0.9.0
matplotlib>=3.0.0,<4.0.0 # depended on by opengraph
myst-parser~=2.0.0
