Spaces:
Runtime error
Runtime error
LOUIS SANNA
committed on
Commit
·
780c913
1
Parent(s):
3a575de
feat(domains)
Browse files- anyqa/config.py +10 -0
- anyqa/retriever.py +7 -8
- app.py +11 -12
anyqa/config.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os


def get_domains(data_dir: str = "data") -> list[str]:
    """Return the names of every subdirectory found under *data_dir*.

    Walks the tree recursively, so nested subdirectories are included as
    well (this matches the original os.walk-based behavior).

    Args:
        data_dir: Root directory to scan. Defaults to ``"data"`` for
            backward compatibility with the original hard-coded path.

    Returns:
        A list of directory names (base names only, not full paths), in
        os.walk traversal order. Empty if *data_dir* does not exist or
        contains no subdirectories.
    """
    # Comprehension replaces the manual append loop; avoid naming a
    # variable `dir`, which shadows the builtin.
    return [name for _root, dirs, _files in os.walk(data_dir) for name in dirs]
|
anyqa/retriever.py
CHANGED
|
@@ -13,25 +13,24 @@ SUMMARY_TYPES = []
|
|
| 13 |
|
| 14 |
class QARetriever(BaseRetriever):
|
| 15 |
vectorstore: VectorStore
|
| 16 |
-
|
| 17 |
threshold: float = 22
|
| 18 |
k_summary: int = 0
|
| 19 |
k_total: int = 10
|
| 20 |
namespace: str = "vectors"
|
| 21 |
|
| 22 |
def get_relevant_documents(self, query: str) -> List[Document]:
|
| 23 |
-
|
| 24 |
-
assert isinstance(self.sources, list)
|
| 25 |
assert self.k_total > self.k_summary, "k_total should be greater than k_summary"
|
| 26 |
|
| 27 |
-
query = "He who can bear the misfortune of a nation is called the ruler of the world."
|
| 28 |
# Prepare base search kwargs
|
| 29 |
filters = {}
|
| 30 |
-
if len(self.
|
| 31 |
-
filters["
|
| 32 |
|
| 33 |
if self.k_summary > 0:
|
| 34 |
# Search for k_summary documents in the summaries dataset
|
|
|
|
| 35 |
if len(SUMMARY_TYPES):
|
| 36 |
filters_summaries = {
|
| 37 |
**filters_summaries,
|
|
@@ -48,7 +47,8 @@ class QARetriever(BaseRetriever):
|
|
| 48 |
docs_summaries = []
|
| 49 |
|
| 50 |
# Search for k_total - k_summary documents in the full reports dataset
|
| 51 |
-
filters_full = {}
|
|
|
|
| 52 |
if len(SUMMARY_TYPES):
|
| 53 |
filters_full = {**filters_full, "report_type": {"$nin": SUMMARY_TYPES}}
|
| 54 |
|
|
@@ -59,7 +59,6 @@ class QARetriever(BaseRetriever):
|
|
| 59 |
filter=self.format_filter(filters_full),
|
| 60 |
k=k_full,
|
| 61 |
)
|
| 62 |
-
print("docs_full", docs_full)
|
| 63 |
|
| 64 |
# Concatenate documents
|
| 65 |
docs = docs_summaries + docs_full
|
|
|
|
| 13 |
|
| 14 |
class QARetriever(BaseRetriever):
|
| 15 |
vectorstore: VectorStore
|
| 16 |
+
domains: list = []
|
| 17 |
threshold: float = 22
|
| 18 |
k_summary: int = 0
|
| 19 |
k_total: int = 10
|
| 20 |
namespace: str = "vectors"
|
| 21 |
|
| 22 |
def get_relevant_documents(self, query: str) -> List[Document]:
|
| 23 |
+
assert isinstance(self.domains, list)
|
|
|
|
| 24 |
assert self.k_total > self.k_summary, "k_total should be greater than k_summary"
|
| 25 |
|
|
|
|
| 26 |
# Prepare base search kwargs
|
| 27 |
filters = {}
|
| 28 |
+
if len(self.domains):
|
| 29 |
+
filters["domain"] = {"$in": self.domains}
|
| 30 |
|
| 31 |
if self.k_summary > 0:
|
| 32 |
# Search for k_summary documents in the summaries dataset
|
| 33 |
+
filters_summaries = {**filters}
|
| 34 |
if len(SUMMARY_TYPES):
|
| 35 |
filters_summaries = {
|
| 36 |
**filters_summaries,
|
|
|
|
| 47 |
docs_summaries = []
|
| 48 |
|
| 49 |
# Search for k_total - k_summary documents in the full reports dataset
|
| 50 |
+
filters_full = {**filters}
|
| 51 |
+
print("filters", filters)
|
| 52 |
if len(SUMMARY_TYPES):
|
| 53 |
filters_full = {**filters_full, "report_type": {"$nin": SUMMARY_TYPES}}
|
| 54 |
|
|
|
|
| 59 |
filter=self.format_filter(filters_full),
|
| 60 |
k=k_full,
|
| 61 |
)
|
|
|
|
| 62 |
|
| 63 |
# Concatenate documents
|
| 64 |
docs = docs_summaries + docs_full
|
app.py
CHANGED
|
@@ -7,6 +7,7 @@ from langchain.embeddings import HuggingFaceEmbeddings
|
|
| 7 |
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
|
| 8 |
|
| 9 |
# ClimateQ&A imports
|
|
|
|
| 10 |
from anyqa.embeddings import EMBEDDING_MODEL_NAME
|
| 11 |
from anyqa.llm import get_llm
|
| 12 |
from anyqa.qa_logging import log
|
|
@@ -136,16 +137,14 @@ def answer_user_example(query, query_example, history):
|
|
| 136 |
return query_example, history + [[query_example, ". . ."]]
|
| 137 |
|
| 138 |
|
| 139 |
-
def fetch_sources(query,
|
| 140 |
-
# Prepare default values
|
| 141 |
-
if len(sources) == 0:
|
| 142 |
-
sources = ["IPCC"]
|
| 143 |
|
| 144 |
llm_reformulation = get_llm(
|
| 145 |
max_tokens=512, temperature=0.0, verbose=True, streaming=False
|
| 146 |
)
|
|
|
|
| 147 |
retriever = QARetriever(
|
| 148 |
-
vectorstore=vectorstore,
|
| 149 |
)
|
| 150 |
reformulation_chain = load_reformulation_chain(llm_reformulation)
|
| 151 |
|
|
@@ -379,11 +378,11 @@ with gr.Blocks(title="❓ Q&A", css="style.css", theme=theme) as demo:
|
|
| 379 |
gr.Markdown(
|
| 380 |
"Reminder: You can talk in any language, this tool is multi-lingual!"
|
| 381 |
)
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
label="Select
|
| 386 |
-
value=[
|
| 387 |
interactive=True,
|
| 388 |
)
|
| 389 |
|
|
@@ -419,7 +418,7 @@ with gr.Blocks(title="❓ Q&A", css="style.css", theme=theme) as demo:
|
|
| 419 |
.success(change_tab, None, tabs)
|
| 420 |
.success(
|
| 421 |
fetch_sources,
|
| 422 |
-
[textbox,
|
| 423 |
[
|
| 424 |
textbox,
|
| 425 |
sources_textbox,
|
|
@@ -454,7 +453,7 @@ with gr.Blocks(title="❓ Q&A", css="style.css", theme=theme) as demo:
|
|
| 454 |
.success(change_tab, None, tabs)
|
| 455 |
.success(
|
| 456 |
fetch_sources,
|
| 457 |
-
[textbox,
|
| 458 |
[
|
| 459 |
textbox,
|
| 460 |
sources_textbox,
|
|
|
|
| 7 |
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
|
| 8 |
|
| 9 |
# ClimateQ&A imports
|
| 10 |
+
from anyqa.config import get_domains
|
| 11 |
from anyqa.embeddings import EMBEDDING_MODEL_NAME
|
| 12 |
from anyqa.llm import get_llm
|
| 13 |
from anyqa.qa_logging import log
|
|
|
|
| 137 |
return query_example, history + [[query_example, ". . ."]]
|
| 138 |
|
| 139 |
|
| 140 |
+
def fetch_sources(query, domains):
|
|
|
|
|
|
|
|
|
|
| 141 |
|
| 142 |
llm_reformulation = get_llm(
|
| 143 |
max_tokens=512, temperature=0.0, verbose=True, streaming=False
|
| 144 |
)
|
| 145 |
+
print("domains", domains)
|
| 146 |
retriever = QARetriever(
|
| 147 |
+
vectorstore=vectorstore, domains=domains, k_summary=0, k_total=10
|
| 148 |
)
|
| 149 |
reformulation_chain = load_reformulation_chain(llm_reformulation)
|
| 150 |
|
|
|
|
| 378 |
gr.Markdown(
|
| 379 |
"Reminder: You can talk in any language, this tool is multi-lingual!"
|
| 380 |
)
|
| 381 |
+
domains = get_domains()
|
| 382 |
+
dropdown_domains = gr.CheckboxGroup(
|
| 383 |
+
domains,
|
| 384 |
+
label="Select source types",
|
| 385 |
+
value=[],
|
| 386 |
interactive=True,
|
| 387 |
)
|
| 388 |
|
|
|
|
| 418 |
.success(change_tab, None, tabs)
|
| 419 |
.success(
|
| 420 |
fetch_sources,
|
| 421 |
+
[textbox, dropdown_domains],
|
| 422 |
[
|
| 423 |
textbox,
|
| 424 |
sources_textbox,
|
|
|
|
| 453 |
.success(change_tab, None, tabs)
|
| 454 |
.success(
|
| 455 |
fetch_sources,
|
| 456 |
+
[textbox, dropdown_domains],
|
| 457 |
[
|
| 458 |
textbox,
|
| 459 |
sources_textbox,
|