diff --git a/.semversioner/next-release/minor-20241227205339264730.json b/.semversioner/next-release/minor-20241227205339264730.json new file mode 100644 index 0000000000..ecabc4316a --- /dev/null +++ b/.semversioner/next-release/minor-20241227205339264730.json @@ -0,0 +1,4 @@ +{ + "type": "minor", + "description": "new search implemented as a new option for the api" +} diff --git a/.semversioner/next-release/minor-20241231213627966329.json b/.semversioner/next-release/minor-20241231213627966329.json new file mode 100644 index 0000000000..93dbd4f4a0 --- /dev/null +++ b/.semversioner/next-release/minor-20241231213627966329.json @@ -0,0 +1,4 @@ +{ + "type": "minor", + "description": "Remove old pipeline runner." +} diff --git a/.semversioner/next-release/minor-20241231214323349946.json b/.semversioner/next-release/minor-20241231214323349946.json new file mode 100644 index 0000000000..a62cae7b78 --- /dev/null +++ b/.semversioner/next-release/minor-20241231214323349946.json @@ -0,0 +1,4 @@ +{ + "type": "minor", + "description": "Remove DataShaper (first steps)." +} diff --git a/.semversioner/next-release/patch-20241230224307150194.json b/.semversioner/next-release/patch-20241230224307150194.json new file mode 100644 index 0000000000..f11788103c --- /dev/null +++ b/.semversioner/next-release/patch-20241230224307150194.json @@ -0,0 +1,4 @@ +{ + "type": "minor", + "description": "Make gleanings independent of encoding" +} diff --git a/.semversioner/next-release/patch-20250102170720512799.json b/.semversioner/next-release/patch-20250102170720512799.json new file mode 100644 index 0000000000..558059249a --- /dev/null +++ b/.semversioner/next-release/patch-20250102170720512799.json @@ -0,0 +1,4 @@ +{ + "type": "patch", + "description": "Remove config input models." 
+} diff --git a/.semversioner/next-release/patch-20250102232542899735.json b/.semversioner/next-release/patch-20250102232542899735.json new file mode 100644 index 0000000000..8b8019cf05 --- /dev/null +++ b/.semversioner/next-release/patch-20250102232542899735.json @@ -0,0 +1,4 @@ +{ + "type": "patch", + "description": "Ruff update" +} diff --git a/dictionary.txt b/dictionary.txt index 575fe55548..02851fcd7b 100644 --- a/dictionary.txt +++ b/dictionary.txt @@ -148,10 +148,6 @@ codebases # Microsoft MSRC -# Broken Upstream -# TODO FIX IN DATASHAPER -Arrary - # Prompt Inputs ABILA Abila diff --git a/docs/blog_posts.md b/docs/blog_posts.md index 2ff64abcea..cc608517bb 100644 --- a/docs/blog_posts.md +++ b/docs/blog_posts.md @@ -44,4 +44,11 @@
Published November 25, 2024 By [Darren Edge](https://www.microsoft.com/en-us/research/people/daedge/), Senior Director; [Ha Trinh](https://www.microsoft.com/en-us/research/people/trinhha/), Senior Data Scientist; [Jonathan Larson](https://www.microsoft.com/en-us/research/people/jolarso/), Senior Principal Data Architect
+ +- [:octicons-arrow-right-24: __Moving to GraphRAG 1.0 – Streamlining ergonomics for developers and users__](https://www.microsoft.com/en-us/research/blog/moving-to-graphrag-1-0-streamlining-ergonomics-for-developers-and-users) + + --- +
Published December 16, 2024 + + By [Nathan Evans](https://www.microsoft.com/en-us/research/people/naevans/), Principal Software Architect; [Alonso Guevara Fernández](https://www.microsoft.com/en-us/research/people/alonsog/), Senior Software Engineer; [Joshua Bradley](https://www.microsoft.com/en-us/research/people/joshbradley/), Senior Data Scientist
\ No newline at end of file diff --git a/docs/examples_notebooks/index_migration.ipynb b/docs/examples_notebooks/index_migration.ipynb index a0ba6ae471..5021fa2cbb 100644 --- a/docs/examples_notebooks/index_migration.ipynb +++ b/docs/examples_notebooks/index_migration.ipynb @@ -206,9 +206,8 @@ "metadata": {}, "outputs": [], "source": [ - "from datashaper import NoopVerbCallbacks\n", - "\n", "from graphrag.cache.factory import create_cache\n", + "from graphrag.callbacks.noop_verb_callbacks import NoopVerbCallbacks\n", "from graphrag.index.flows.generate_text_embeddings import generate_text_embeddings\n", "\n", "# We only need to re-run the embeddings workflow, to ensure that embeddings for all required search fields are in place\n", diff --git a/docs/index/architecture.md b/docs/index/architecture.md index 12c6015012..2d5d110ba7 100644 --- a/docs/index/architecture.md +++ b/docs/index/architecture.md @@ -8,33 +8,9 @@ In order to support the GraphRAG system, the outputs of the indexing engine (in This model is designed to be an abstraction over the underlying data storage technology, and to provide a common interface for the GraphRAG system to interact with. In normal use-cases the outputs of the GraphRAG Indexer would be loaded into a database system, and the GraphRAG's Query Engine would interact with the database using the knowledge model data-store types. -### DataShaper Workflows - -GraphRAG's Indexing Pipeline is built on top of our open-source library, [DataShaper](https://github.com/microsoft/datashaper). -DataShaper is a data processing library that allows users to declaratively express data pipelines, schemas, and related assets using well-defined schemas. -DataShaper has implementations in JavaScript and Python, and is designed to be extensible to other languages. - -One of the core resource types within DataShaper is a [Workflow](https://github.com/microsoft/datashaper/blob/main/javascript/schema/src/workflow/WorkflowSchema.ts). 
-Workflows are expressed as sequences of steps, which we call [verbs](https://github.com/microsoft/datashaper/blob/main/javascript/schema/src/workflow/verbs.ts). -Each step has a verb name and a configuration object. -In DataShaper, these verbs model relational concepts such as SELECT, DROP, JOIN, etc.. Each verb transforms an input data table, and that table is passed down the pipeline. - -```mermaid ---- -title: Sample Workflow ---- -flowchart LR - input[Input Table] --> select[SELECT] --> join[JOIN] --> binarize[BINARIZE] --> output[Output Table] -``` - -### LLM-based Workflow Steps - -GraphRAG's Indexing Pipeline implements a handful of custom verbs on top of the standard, relational verbs that our DataShaper library provides. These verbs give us the ability to augment text documents with rich, structured data using the power of LLMs such as GPT-4. We utilize these verbs in our standard workflow to extract entities, relationships, claims, community structures, and community reports and summaries. This behavior is customizable and can be extended to support many kinds of AI-based data enrichment and extraction tasks. - -### Workflow Graphs +### Workflows Because of the complexity of our data indexing tasks, we needed to be able to express our data pipeline as series of multiple, interdependent workflows. -In the GraphRAG Indexing Pipeline, each workflow may define dependencies on other workflows, effectively forming a directed acyclic graph (DAG) of workflows, which is then used to schedule processing. ```mermaid --- @@ -55,7 +31,7 @@ stateDiagram-v2 The primary unit of communication between workflows, and between workflow steps is an instance of `pandas.DataFrame`. Although side-effects are possible, our goal is to be _data-centric_ and _table-centric_ in our approach to data processing. This allows us to easily reason about our data, and to leverage the power of dataframe-based ecosystems. 
-Our underlying dataframe technology may change over time, but our primary goal is to support the DataShaper workflow schema while retaining single-machine ease of use and developer ergonomics. +Our underlying dataframe technology may change over time, but our primary goal is to support the workflow schema while retaining single-machine ease of use and developer ergonomics. ### LLM Caching diff --git a/examples/README.md b/examples/README.md deleted file mode 100644 index 5d80b4fd4c..0000000000 --- a/examples/README.md +++ /dev/null @@ -1,19 +0,0 @@ -# Indexing Engine Examples -This directory contains several examples of how to use the indexing engine. - -Most examples include two different forms of running the pipeline, both are contained in the examples `run.py` -1. Using mostly the Python API -2. Using mostly the a pipeline configuration file - -# Running an Example -First run `poetry shell` to activate a virtual environment with the required dependencies. - -Then run `PYTHONPATH="$(pwd)" python examples/path_to_example/run.py` from the `python/graphrag` directory. - -For example to run the single_verb example, you would run the following commands: - -```bash -cd python/graphrag -poetry shell -PYTHONPATH="$(pwd)" python examples/single_verb/run.py -``` \ No newline at end of file diff --git a/examples/__init__.py b/examples/__init__.py deleted file mode 100644 index 0a3e38adfb..0000000000 --- a/examples/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License diff --git a/examples/custom_input/__init__.py b/examples/custom_input/__init__.py deleted file mode 100644 index 0a3e38adfb..0000000000 --- a/examples/custom_input/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
-# Licensed under the MIT License diff --git a/examples/custom_input/pipeline.yml b/examples/custom_input/pipeline.yml deleted file mode 100644 index 80340c8291..0000000000 --- a/examples/custom_input/pipeline.yml +++ /dev/null @@ -1,24 +0,0 @@ - -# Setup reporting however you'd like -reporting: - type: console - -# Setup storage however you'd like -storage: - type: memory - -# Setup cache however you'd like -cache: - type: memory - -# Just a simple workflow -workflows: - - # This is an anonymous workflow, it doesn't have a name - - steps: - - # Unpack the nodes from the graph - - verb: fill - args: - to: filled_column - value: "Filled Value" \ No newline at end of file diff --git a/examples/custom_input/run.py b/examples/custom_input/run.py deleted file mode 100644 index debb022379..0000000000 --- a/examples/custom_input/run.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License -import asyncio -import os - -import pandas as pd - -from graphrag.index.run import run_pipeline_with_config - -pipeline_file = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "./pipeline.yml" -) - - -async def run(): - # Load your dataset - dataset = _load_dataset_some_unique_way() - - # Load your config without the input section - config = pipeline_file - - # Grab the last result from the pipeline, should be our entity extraction - outputs = [] - async for output in run_pipeline_with_config( - config_or_path=config, dataset=dataset - ): - outputs.append(output) - pipeline_result = outputs[-1] - - if pipeline_result.result is not None: - # Should look something like - # col1 col2 filled_column - # 0 2 4 Filled Value - # 1 5 10 Filled Value - print(pipeline_result.result) - else: - print("No results!") - - -def _load_dataset_some_unique_way() -> pd.DataFrame: - # Totally loaded from some other place - return pd.DataFrame([{"col1": 2, "col2": 4}, {"col1": 5, "col2": 10}]) - - -if __name__ == "__main__": - 
asyncio.run(run()) diff --git a/examples/single_verb/input/data.csv b/examples/single_verb/input/data.csv deleted file mode 100644 index d1aaf77bfe..0000000000 --- a/examples/single_verb/input/data.csv +++ /dev/null @@ -1,3 +0,0 @@ -col1,col2 -2,4 -5,10 \ No newline at end of file diff --git a/examples/single_verb/pipeline.yml b/examples/single_verb/pipeline.yml deleted file mode 100644 index 9e8046124d..0000000000 --- a/examples/single_verb/pipeline.yml +++ /dev/null @@ -1,12 +0,0 @@ -input: - file_type: csv - base_dir: ./input - file_pattern: .*\.csv$ -workflows: - - steps: - - verb: derive # https://github.com/microsoft/datashaper/blob/main/python/datashaper/datashaper/verbs/derive.py - args: - column1: "col1" - column2: "col2" - to: "col_multiplied" - operator: "*" diff --git a/examples/single_verb/run.py b/examples/single_verb/run.py deleted file mode 100644 index 99f8137a98..0000000000 --- a/examples/single_verb/run.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
-# Licensed under the MIT License -import asyncio -import os - -import pandas as pd - -from graphrag.index.config.workflow import PipelineWorkflowReference -from graphrag.index.run import run_pipeline, run_pipeline_with_config - -# our fake dataset -dataset = pd.DataFrame([{"col1": 2, "col2": 4}, {"col1": 5, "col2": 10}]) - - -async def run_with_config(): - """Run a pipeline with a config file""" - # load pipeline.yml in this directory - config_path = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "./pipeline.yml" - ) - - tables = [] - async for table in run_pipeline_with_config( - config_or_path=config_path, dataset=dataset - ): - tables.append(table) - pipeline_result = tables[-1] - - if pipeline_result.result is not None: - # Should look something like this, which should be identical to the python example: - # col1 col2 col_multiplied - # 0 2 4 8 - # 1 5 10 50 - print(pipeline_result.result) - else: - print("No results!") - - -async def run_python(): - """Run a pipeline using the python API""" - workflows: list[PipelineWorkflowReference] = [ - PipelineWorkflowReference( - steps=[ - { - # built-in verb - "verb": "derive", # https://github.com/microsoft/datashaper/blob/main/python/datashaper/datashaper/verbs/derive.py - "args": { - "column1": "col1", # from above - "column2": "col2", # from above - "to": "col_multiplied", # new column name - "operator": "*", # multiply the two columns - }, - # Since we're trying to act on the default input, we don't need explicitly to specify an input - } - ] - ), - ] - - # Grab the last result from the pipeline, should be our entity extraction - tables = [] - async for table in run_pipeline(dataset=dataset, workflows=workflows): - tables.append(table) - pipeline_result = tables[-1] - - if pipeline_result.result is not None: - # Should look something like this: - # col1 col2 col_multiplied - # 0 2 4 8 - # 1 5 10 50 - print(pipeline_result.result) - else: - print("No results!") - - -if __name__ == "__main__": - 
asyncio.run(run_with_config()) - asyncio.run(run_python()) diff --git a/examples/use_built_in_workflows/__init__.py b/examples/use_built_in_workflows/__init__.py deleted file mode 100644 index 0a3e38adfb..0000000000 --- a/examples/use_built_in_workflows/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License diff --git a/examples/use_built_in_workflows/pipeline.yml b/examples/use_built_in_workflows/pipeline.yml deleted file mode 100644 index cb1896857f..0000000000 --- a/examples/use_built_in_workflows/pipeline.yml +++ /dev/null @@ -1,23 +0,0 @@ -workflows: - - name: "entity_extraction" - config: - entity_extract: - strategy: - type: "nltk" - - - name: "entity_graph" - config: - cluster_graph: - strategy: - type: "leiden" - embed_graph: - strategy: - type: "node2vec" - num_walks: 10 - walk_length: 40 - window_size: 2 - iterations: 3 - random_seed: 597832 - layout_graph: - strategy: - type: "umap" \ No newline at end of file diff --git a/examples/use_built_in_workflows/run.py b/examples/use_built_in_workflows/run.py deleted file mode 100644 index adda7f6b4c..0000000000 --- a/examples/use_built_in_workflows/run.py +++ /dev/null @@ -1,118 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
-# Licensed under the MIT License -import asyncio -import os - -from graphrag.index.config.input import PipelineCSVInputConfig -from graphrag.index.config.workflow import PipelineWorkflowReference -from graphrag.index.input.factory import create_input -from graphrag.index.run import run_pipeline, run_pipeline_with_config - -sample_data_dir = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "../_sample_data/" -) - -# Load our dataset once -shared_dataset = asyncio.run( - create_input( - PipelineCSVInputConfig( - file_pattern=".*\\.csv$", - base_dir=sample_data_dir, - source_column="author", - text_column="message", - timestamp_column="date(yyyyMMddHHmmss)", - timestamp_format="%Y%m%d%H%M%S", - title_column="message", - ), - ) -) - - -async def run_with_config(): - """Run a pipeline with a config file""" - # We're cheap, and this is an example, lets just do 10 - dataset = shared_dataset.head(10) - - # load pipeline.yml in this directory - config_path = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "./pipeline.yml" - ) - - # Grab the last result from the pipeline, should be our entity extraction - tables = [] - async for table in run_pipeline_with_config( - config_or_path=config_path, dataset=dataset - ): - tables.append(table) - pipeline_result = tables[-1] - - if pipeline_result.result is not None: - # The output of this should match the run_python() example - first_result = pipeline_result.result.head(1) - print(f"level: {first_result['level'][0]}") - print(f"embeddings: {first_result['embeddings'][0]}") - print(f"entity_graph_positions: {first_result['node_positions'][0]}") - else: - print("No results!") - - -async def run_python(): - # We're cheap, and this is an example, lets just do 10 - dataset = shared_dataset.head(10) - - workflows: list[PipelineWorkflowReference] = [ - # This workflow reference here is only necessary - # because we want to customize the entity_extraction workflow is configured - # otherwise, it can be omitted, but 
you're stuck with the default configuration for entity_extraction - PipelineWorkflowReference( - name="entity_extraction", - config={ - "entity_extract": { - "strategy": { - "type": "nltk", - } - } - }, - ), - PipelineWorkflowReference( - name="entity_graph", - config={ - "cluster_graph": {"strategy": {"type": "leiden"}}, - "embed_graph": { - "strategy": { - "type": "node2vec", - "num_walks": 10, - "walk_length": 40, - "window_size": 2, - "iterations": 3, - "random_seed": 597832, - } - }, - "layout_graph": { - "strategy": { - "type": "umap", - }, - }, - }, - ), - ] - - # Grab the last result from the pipeline, should be our entity extraction - tables = [] - async for table in run_pipeline(dataset=dataset, workflows=workflows): - tables.append(table) - pipeline_result = tables[-1] - - # The output will contain entity graphs per hierarchical level, with embeddings per entity - if pipeline_result.result is not None: - first_result = pipeline_result.result.head(1) - print(f"level: {first_result['level'][0]}") - print(f"embeddings: {first_result['embeddings'][0]}") - print(f"entity_graph_positions: {first_result['node_positions'][0]}") - else: - print("No results!") - - -if __name__ == "__main__": - asyncio.run(run_python()) - asyncio.run(run_with_config()) diff --git a/graphrag/api/__init__.py b/graphrag/api/__init__.py index 6165122e5c..a6ed5764c1 100644 --- a/graphrag/api/__init__.py +++ b/graphrag/api/__init__.py @@ -10,6 +10,8 @@ from graphrag.api.index import build_index from graphrag.api.prompt_tune import generate_indexing_prompts from graphrag.api.query import ( + basic_search, + basic_search_streaming, drift_search, global_search, global_search_streaming, @@ -27,6 +29,8 @@ "local_search", "local_search_streaming", "drift_search", + "basic_search", + "basic_search_streaming", # prompt tuning API "DocSelectionType", "generate_indexing_prompts", diff --git a/graphrag/api/index.py b/graphrag/api/index.py index 48c6be663b..2ad41c2f40 100644 --- 
a/graphrag/api/index.py +++ b/graphrag/api/index.py @@ -10,14 +10,11 @@ import logging -from datashaper import WorkflowCallbacks - from graphrag.cache.noop_pipeline_cache import NoopPipelineCache from graphrag.callbacks.factory import create_pipeline_reporter +from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks from graphrag.config.enums import CacheType from graphrag.config.models.graph_rag_config import GraphRagConfig -from graphrag.index.create_pipeline_config import create_pipeline_config -from graphrag.index.run import run_pipeline_with_config from graphrag.index.run.run_workflows import run_workflows from graphrag.index.typing import PipelineRunResult from graphrag.logger.base import ProgressLogger @@ -32,7 +29,6 @@ async def build_index( memory_profile: bool = False, callbacks: list[WorkflowCallbacks] | None = None, progress_logger: ProgressLogger | None = None, - new_pipeline: bool = False, ) -> list[PipelineRunResult]: """Run the pipeline with the given configuration. 
@@ -71,41 +67,23 @@ async def build_index( callbacks.append(create_pipeline_reporter(config.reporting, None)) # type: ignore outputs: list[PipelineRunResult] = [] - if new_pipeline: - log.info("RUNNING NEW WORKFLOWS WITHOUT DATASHAPER") - async for output in run_workflows( - config, - cache=pipeline_cache, - callbacks=callbacks, - logger=progress_logger, - run_id=run_id, - ): - outputs.append(output) - if progress_logger: - if output.errors and len(output.errors) > 0: - progress_logger.error(output.workflow) - else: - progress_logger.success(output.workflow) - progress_logger.info(str(output.result)) - else: - log.info("RUNNING ORIGINAL PIPELINE") - pipeline_config = create_pipeline_config(config) - async for output in run_pipeline_with_config( - pipeline_config, - run_id=run_id, - memory_profile=memory_profile, - cache=pipeline_cache, - callbacks=callbacks, - logger=progress_logger, - is_resume_run=is_resume_run, - is_update_run=is_update_run, - ): - outputs.append(output) - if progress_logger: - if output.errors and len(output.errors) > 0: - progress_logger.error(output.workflow) - else: - progress_logger.success(output.workflow) - progress_logger.info(str(output.result)) + if memory_profile: + log.warning("New pipeline does not yet support memory profiling.") + + async for output in run_workflows( + config, + cache=pipeline_cache, + callbacks=callbacks, + logger=progress_logger, + run_id=run_id, + is_update_run=is_update_run, + ): + outputs.append(output) + if progress_logger: + if output.errors and len(output.errors) > 0: + progress_logger.error(output.workflow) + else: + progress_logger.success(output.workflow) + progress_logger.info(str(output.result)) return outputs diff --git a/graphrag/api/prompt_tune.py b/graphrag/api/prompt_tune.py index 9d0823e93f..98c1dac3ba 100644 --- a/graphrag/api/prompt_tune.py +++ b/graphrag/api/prompt_tune.py @@ -11,9 +11,9 @@ Backwards compatibility is not guaranteed at this time. 
""" -from datashaper import NoopVerbCallbacks from pydantic import PositiveInt, validate_call +from graphrag.callbacks.noop_verb_callbacks import NoopVerbCallbacks from graphrag.config.models.graph_rag_config import GraphRagConfig from graphrag.index.llm.load_llm import load_llm from graphrag.logger.print_progress import PrintProgressLogger diff --git a/graphrag/api/query.py b/graphrag/api/query.py index 1f5899a646..f41eec3e9b 100644 --- a/graphrag/api/query.py +++ b/graphrag/api/query.py @@ -28,9 +28,11 @@ from graphrag.index.config.embeddings import ( community_full_content_embedding, entity_description_embedding, + text_unit_text_embedding, ) from graphrag.logger.print_progress import PrintProgressLogger from graphrag.query.factory import ( + get_basic_search_engine, get_drift_search_engine, get_global_search_engine, get_local_search_engine, @@ -423,6 +425,108 @@ async def drift_search( return response, context_data +@validate_call(config={"arbitrary_types_allowed": True}) +async def basic_search( + config: GraphRagConfig, + text_units: pd.DataFrame, + query: str, +) -> tuple[ + str | dict[str, Any] | list[dict[str, Any]], + str | list[pd.DataFrame] | dict[str, pd.DataFrame], +]: + """Perform a basic search and return the context data and response. + + Parameters + ---------- + - config (GraphRagConfig): A graphrag configuration (from settings.yaml) + - text_units (pd.DataFrame): A DataFrame containing the final text units (from create_final_text_units.parquet) + - query (str): The user query to search for. + + Returns + ------- + TODO: Document the search response type and format. + + Raises + ------ + TODO: Document any exceptions to expect. 
+ """ + vector_store_args = config.embeddings.vector_store + logger.info(f"Vector Store Args: {redact(vector_store_args)}") # type: ignore # noqa + + description_embedding_store = _get_embedding_store( + config_args=vector_store_args, # type: ignore + embedding_name=text_unit_text_embedding, + ) + + prompt = _load_search_prompt(config.root_dir, config.basic_search.prompt) + + search_engine = get_basic_search_engine( + config=config, + text_units=read_indexer_text_units(text_units), + text_unit_embeddings=description_embedding_store, + system_prompt=prompt, + ) + + result: SearchResult = await search_engine.asearch(query=query) + response = result.response + context_data = _reformat_context_data(result.context_data) # type: ignore + return response, context_data + + +@validate_call(config={"arbitrary_types_allowed": True}) +async def basic_search_streaming( + config: GraphRagConfig, + text_units: pd.DataFrame, + query: str, +) -> AsyncGenerator: + """Perform a basic search and return the context data and response via a generator. + + Parameters + ---------- + - config (GraphRagConfig): A graphrag configuration (from settings.yaml) + - text_units (pd.DataFrame): A DataFrame containing the final text units (from create_final_text_units.parquet) + - query (str): The user query to search for. + + Returns + ------- + TODO: Document the search response type and format. + + Raises + ------ + TODO: Document any exceptions to expect. 
+ """ + vector_store_args = config.embeddings.vector_store + logger.info(f"Vector Store Args: {redact(vector_store_args)}") # type: ignore # noqa + + description_embedding_store = _get_embedding_store( + config_args=vector_store_args, # type: ignore + embedding_name=text_unit_text_embedding, + ) + + prompt = _load_search_prompt(config.root_dir, config.basic_search.prompt) + + search_engine = get_basic_search_engine( + config=config, + text_units=read_indexer_text_units(text_units), + text_unit_embeddings=description_embedding_store, + system_prompt=prompt, + ) + + search_result = search_engine.astream_search(query=query) + + # when streaming results, a context data object is returned as the first result + # and the query response in subsequent tokens + context_data = None + get_context_data = True + async for stream_chunk in search_result: + if get_context_data: + context_data = _reformat_context_data(stream_chunk) # type: ignore + yield context_data + get_context_data = False + else: + yield stream_chunk + + def _get_embedding_store( config_args: dict, embedding_name: str, diff --git a/graphrag/callbacks/blob_workflow_callbacks.py b/graphrag/callbacks/blob_workflow_callbacks.py index 56ed317a9f..36bd5f9e83 100644 --- a/graphrag/callbacks/blob_workflow_callbacks.py +++ b/graphrag/callbacks/blob_workflow_callbacks.py @@ -10,7 +10,8 @@ from azure.identity import DefaultAzureCredential from azure.storage.blob import BlobServiceClient -from datashaper import NoopWorkflowCallbacks + +from graphrag.callbacks.noop_workflow_callbacks import NoopWorkflowCallbacks class BlobWorkflowCallbacks(NoopWorkflowCallbacks): diff --git a/graphrag/callbacks/console_workflow_callbacks.py b/graphrag/callbacks/console_workflow_callbacks.py index 4e70ba7109..a2ab6ef08a 100644 --- a/graphrag/callbacks/console_workflow_callbacks.py +++ b/graphrag/callbacks/console_workflow_callbacks.py @@ -3,7 +3,7 @@ """A logger that emits updates from the indexing engine to the console.""" -from datashaper 
import NoopWorkflowCallbacks +from graphrag.callbacks.noop_workflow_callbacks import NoopWorkflowCallbacks class ConsoleWorkflowCallbacks(NoopWorkflowCallbacks): diff --git a/graphrag/callbacks/delegating_verb_callbacks.py b/graphrag/callbacks/delegating_verb_callbacks.py new file mode 100644 index 0000000000..11687f3a24 --- /dev/null +++ b/graphrag/callbacks/delegating_verb_callbacks.py @@ -0,0 +1,46 @@ +# Copyright (c) 2024 Microsoft Corporation. +# Licensed under the MIT License + +"""Contains the DelegatingVerbCallback definition.""" + +from graphrag.callbacks.verb_callbacks import VerbCallbacks +from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks +from graphrag.logger.progress import Progress + + +class DelegatingVerbCallbacks(VerbCallbacks): + """A wrapper that implements VerbCallbacks that delegates to the underlying WorkflowCallbacks.""" + + _workflow_callbacks: WorkflowCallbacks + _name: str + + def __init__(self, name: str, workflow_callbacks: WorkflowCallbacks): + """Create a new instance of DelegatingVerbCallbacks.""" + self._workflow_callbacks = workflow_callbacks + self._name = name + + def progress(self, progress: Progress) -> None: + """Handle when progress occurs.""" + self._workflow_callbacks.on_step_progress(self._name, progress) + + def error( + self, + message: str, + cause: BaseException | None = None, + stack: str | None = None, + details: dict | None = None, + ) -> None: + """Handle when an error occurs.""" + self._workflow_callbacks.on_error(message, cause, stack, details) + + def warning(self, message: str, details: dict | None = None) -> None: + """Handle when a warning occurs.""" + self._workflow_callbacks.on_warning(message, details) + + def log(self, message: str, details: dict | None = None) -> None: + """Handle when a log occurs.""" + self._workflow_callbacks.on_log(message, details) + + def measure(self, name: str, value: float, details: dict | None = None) -> None: + """Handle when a measurement occurs.""" + 
self._workflow_callbacks.on_measure(name, value, details) diff --git a/graphrag/callbacks/factory.py b/graphrag/callbacks/factory.py index 26b33b713b..bffc3f2cc2 100644 --- a/graphrag/callbacks/factory.py +++ b/graphrag/callbacks/factory.py @@ -6,11 +6,10 @@ from pathlib import Path from typing import cast -from datashaper import WorkflowCallbacks - from graphrag.callbacks.blob_workflow_callbacks import BlobWorkflowCallbacks from graphrag.callbacks.console_workflow_callbacks import ConsoleWorkflowCallbacks from graphrag.callbacks.file_workflow_callbacks import FileWorkflowCallbacks +from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks from graphrag.config.enums import ReportingType from graphrag.index.config.reporting import ( PipelineBlobReportingConfig, diff --git a/graphrag/callbacks/file_workflow_callbacks.py b/graphrag/callbacks/file_workflow_callbacks.py index 95ccfea272..b3b5ca1963 100644 --- a/graphrag/callbacks/file_workflow_callbacks.py +++ b/graphrag/callbacks/file_workflow_callbacks.py @@ -8,7 +8,7 @@ from io import TextIOWrapper from pathlib import Path -from datashaper import NoopWorkflowCallbacks +from graphrag.callbacks.noop_workflow_callbacks import NoopWorkflowCallbacks log = logging.getLogger(__name__) diff --git a/graphrag/callbacks/noop_verb_callbacks.py b/graphrag/callbacks/noop_verb_callbacks.py new file mode 100644 index 0000000000..5a2000af67 --- /dev/null +++ b/graphrag/callbacks/noop_verb_callbacks.py @@ -0,0 +1,35 @@ +# Copyright (c) 2024 Microsoft Corporation. 
+# Licensed under the MIT License + +"""A no-op implementation of VerbCallbacks.""" + +from graphrag.callbacks.verb_callbacks import VerbCallbacks +from graphrag.logger.progress import Progress + + +class NoopVerbCallbacks(VerbCallbacks): + """A noop implementation of the verb callbacks.""" + + def __init__(self) -> None: + pass + + def progress(self, progress: Progress) -> None: + """Report a progress update from the verb execution.""" + + def error( + self, + message: str, + cause: BaseException | None = None, + stack: str | None = None, + details: dict | None = None, + ) -> None: + """Report an error from the verb execution.""" + + def warning(self, message: str, details: dict | None = None) -> None: + """Report a warning from verb execution.""" + + def log(self, message: str, details: dict | None = None) -> None: + """Report an informational message from the verb execution.""" + + def measure(self, name: str, value: float) -> None: + """Report a telemetry measurement from the verb execution.""" diff --git a/graphrag/callbacks/noop_workflow_callbacks.py b/graphrag/callbacks/noop_workflow_callbacks.py new file mode 100644 index 0000000000..2e8d6b883d --- /dev/null +++ b/graphrag/callbacks/noop_workflow_callbacks.py @@ -0,0 +1,46 @@ +# Copyright (c) 2024 Microsoft Corporation. 
+# Licensed under the MIT License + +"""A no-op implementation of WorkflowCallbacks.""" + +from typing import Any + +from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks +from graphrag.logger.progress import Progress + + +class NoopWorkflowCallbacks(WorkflowCallbacks): + """A no-op implementation of WorkflowCallbacks.""" + + def on_workflow_start(self, name: str, instance: object) -> None: + """Execute this callback when a workflow starts.""" + + def on_workflow_end(self, name: str, instance: object) -> None: + """Execute this callback when a workflow ends.""" + + def on_step_start(self, step_name: str) -> None: + """Execute this callback every time a step starts.""" + + def on_step_end(self, step_name: str, result: Any) -> None: + """Execute this callback every time a step ends.""" + + def on_step_progress(self, step_name: str, progress: Progress) -> None: + """Handle when progress occurs.""" + + def on_error( + self, + message: str, + cause: BaseException | None = None, + stack: str | None = None, + details: dict | None = None, + ) -> None: + """Handle when an error occurs.""" + + def on_warning(self, message: str, details: dict | None = None) -> None: + """Handle when a warning occurs.""" + + def on_log(self, message: str, details: dict | None = None) -> None: + """Handle when a log message occurs.""" + + def on_measure(self, name: str, value: float, details: dict | None = None) -> None: + """Handle when a measurement occurs.""" diff --git a/graphrag/callbacks/progress_workflow_callbacks.py b/graphrag/callbacks/progress_workflow_callbacks.py index 9fda1e0c06..1dc4ada022 100644 --- a/graphrag/callbacks/progress_workflow_callbacks.py +++ b/graphrag/callbacks/progress_workflow_callbacks.py @@ -5,9 +5,9 @@ from typing import Any -from datashaper import ExecutionNode, NoopWorkflowCallbacks, Progress, TableContainer - +from graphrag.callbacks.noop_workflow_callbacks import NoopWorkflowCallbacks from graphrag.logger.base import ProgressLogger +from 
graphrag.logger.progress import Progress class ProgressWorkflowCallbacks(NoopWorkflowCallbacks): @@ -39,16 +39,15 @@ def on_workflow_end(self, name: str, instance: object) -> None: """Execute this callback when a workflow ends.""" self._pop() - def on_step_start(self, node: ExecutionNode, inputs: dict[str, Any]) -> None: + def on_step_start(self, step_name: str) -> None: """Execute this callback every time a step starts.""" - verb_id_str = f" ({node.node_id})" if node.has_explicit_id else "" - self._push(f"Verb {node.verb.name}{verb_id_str}") + self._push(f"Step {step_name}") self._latest(Progress(percent=0)) - def on_step_end(self, node: ExecutionNode, result: TableContainer | None) -> None: + def on_step_end(self, step_name: str, result: Any) -> None: """Execute this callback every time a step ends.""" self._pop() - def on_step_progress(self, node: ExecutionNode, progress: Progress) -> None: + def on_step_progress(self, step_name: str, progress: Progress) -> None: """Handle when progress occurs.""" self._latest(progress) diff --git a/graphrag/callbacks/verb_callbacks.py b/graphrag/callbacks/verb_callbacks.py new file mode 100644 index 0000000000..9489b4cab3 --- /dev/null +++ b/graphrag/callbacks/verb_callbacks.py @@ -0,0 +1,38 @@ +# Copyright (c) 2024 Microsoft Corporation. +# Licensed under the MIT License + +"""Defines the interface for verb callbacks.""" + +from typing import Protocol + +from graphrag.logger.progress import Progress + + +class VerbCallbacks(Protocol): + """Provides a way to report status updates from the pipeline.""" + + def progress(self, progress: Progress) -> None: + """Report a progress update from the verb execution.""" + ... + + def error( + self, + message: str, + cause: BaseException | None = None, + stack: str | None = None, + details: dict | None = None, + ) -> None: + """Report an error from the verb execution.""" + ... 
+ + def warning(self, message: str, details: dict | None = None) -> None: + """Report a warning from verb execution.""" + ... + + def log(self, message: str, details: dict | None = None) -> None: + """Report an informational message from the verb execution.""" + ... + + def measure(self, name: str, value: float) -> None: + """Report a telemetry measurement from the verb execution.""" + ... diff --git a/graphrag/callbacks/workflow_callbacks.py b/graphrag/callbacks/workflow_callbacks.py new file mode 100644 index 0000000000..f1adec6cb6 --- /dev/null +++ b/graphrag/callbacks/workflow_callbacks.py @@ -0,0 +1,58 @@ +# Copyright (c) 2024 Microsoft Corporation. +# Licensed under the MIT License + +"""Collection of callbacks that can be used to monitor the workflow execution.""" + +from typing import Any, Protocol + +from graphrag.logger.progress import Progress + + +class WorkflowCallbacks(Protocol): + """ + A collection of callbacks that can be used to monitor the workflow execution. + + This base class is a "noop" implementation so that clients may implement just the callbacks they need. + """ + + def on_workflow_start(self, name: str, instance: object) -> None: + """Execute this callback when a workflow starts.""" + ... + + def on_workflow_end(self, name: str, instance: object) -> None: + """Execute this callback when a workflow ends.""" + ... + + def on_step_start(self, step_name: str) -> None: + """Execute this callback every time a step starts.""" + ... + + def on_step_end(self, step_name: str, result: Any) -> None: + """Execute this callback every time a step ends.""" + ... + + def on_step_progress(self, step_name: str, progress: Progress) -> None: + """Handle when progress occurs.""" + ... + + def on_error( + self, + message: str, + cause: BaseException | None = None, + stack: str | None = None, + details: dict | None = None, + ) -> None: + """Handle when an error occurs.""" + ... 
+ + def on_warning(self, message: str, details: dict | None = None) -> None: + """Handle when a warning occurs.""" + ... + + def on_log(self, message: str, details: dict | None = None) -> None: + """Handle when a log message occurs.""" + ... + + def on_measure(self, name: str, value: float, details: dict | None = None) -> None: + """Handle when a measurement occurs.""" + ... diff --git a/graphrag/callbacks/workflow_callbacks_manager.py b/graphrag/callbacks/workflow_callbacks_manager.py new file mode 100644 index 0000000000..d677462cb7 --- /dev/null +++ b/graphrag/callbacks/workflow_callbacks_manager.py @@ -0,0 +1,83 @@ +# Copyright (c) 2024 Microsoft Corporation. +# Licensed under the MIT License + +"""A module containing the WorkflowCallbacks registry.""" + +from typing import Any + +from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks +from graphrag.logger.progress import Progress + + +class WorkflowCallbacksManager(WorkflowCallbacks): + """A registry of WorkflowCallbacks.""" + + _callbacks: list[WorkflowCallbacks] + + def __init__(self): + """Create a new instance of WorkflowCallbacksManager.""" + self._callbacks = [] + + def register(self, callbacks: WorkflowCallbacks) -> None: + """Register a new WorkflowCallbacks type.""" + self._callbacks.append(callbacks) + + def on_workflow_start(self, name: str, instance: object) -> None: + """Execute this callback when a workflow starts.""" + for callback in self._callbacks: + if hasattr(callback, "on_workflow_start"): + callback.on_workflow_start(name, instance) + + def on_workflow_end(self, name: str, instance: object) -> None: + """Execute this callback when a workflow ends.""" + for callback in self._callbacks: + if hasattr(callback, "on_workflow_end"): + callback.on_workflow_end(name, instance) + + def on_step_start(self, step_name: str) -> None: + """Execute this callback every time a step starts.""" + for callback in self._callbacks: + if hasattr(callback, "on_step_start"): + 
callback.on_step_start(step_name) + + def on_step_end(self, step_name: str, result: Any) -> None: + """Execute this callback every time a step ends.""" + for callback in self._callbacks: + if hasattr(callback, "on_step_end"): + callback.on_step_end(step_name, result) + + def on_step_progress(self, step_name: str, progress: Progress) -> None: + """Handle when progress occurs.""" + for callback in self._callbacks: + if hasattr(callback, "on_step_progress"): + callback.on_step_progress(step_name, progress) + + def on_error( + self, + message: str, + cause: BaseException | None = None, + stack: str | None = None, + details: dict | None = None, + ) -> None: + """Handle when an error occurs.""" + for callback in self._callbacks: + if hasattr(callback, "on_error"): + callback.on_error(message, cause, stack, details) + + def on_warning(self, message: str, details: dict | None = None) -> None: + """Handle when a warning occurs.""" + for callback in self._callbacks: + if hasattr(callback, "on_warning"): + callback.on_warning(message, details) + + def on_log(self, message: str, details: dict | None = None) -> None: + """Handle when a log message occurs.""" + for callback in self._callbacks: + if hasattr(callback, "on_log"): + callback.on_log(message, details) + + def on_measure(self, name: str, value: float, details: dict | None = None) -> None: + """Handle when a measurement occurs.""" + for callback in self._callbacks: + if hasattr(callback, "on_measure"): + callback.on_measure(name, value, details) diff --git a/graphrag/cli/index.py b/graphrag/cli/index.py index 64eb0fc5e8..260161b98e 100644 --- a/graphrag/cli/index.py +++ b/graphrag/cli/index.py @@ -74,7 +74,6 @@ def index_cli( dry_run: bool, skip_validation: bool, output_dir: Path | None, - new_pipeline: bool, ): """Run the pipeline with the given config.""" config = load_config(root_dir, config_filepath) @@ -89,7 +88,6 @@ def index_cli( dry_run=dry_run, skip_validation=skip_validation, output_dir=output_dir, - 
new_pipeline=new_pipeline, ) @@ -126,7 +124,6 @@ def update_cli( dry_run=False, skip_validation=skip_validation, output_dir=output_dir, - new_pipeline=False, ) @@ -140,7 +137,6 @@ def _run_index( dry_run, skip_validation, output_dir, - new_pipeline, ): progress_logger = LoggerFactory().create_logger(logger) info, error, success = _logger(progress_logger) @@ -186,7 +182,6 @@ def _run_index( is_resume_run=bool(resume), memory_profile=memprofile, progress_logger=progress_logger, - new_pipeline=new_pipeline, ) ) encountered_errors = any( diff --git a/graphrag/cli/initialize.py b/graphrag/cli/initialize.py index c7ce4326e8..a46664b6d0 100644 --- a/graphrag/cli/initialize.py +++ b/graphrag/cli/initialize.py @@ -13,6 +13,7 @@ ) from graphrag.prompts.index.entity_extraction import GRAPH_EXTRACTION_PROMPT from graphrag.prompts.index.summarize_descriptions import SUMMARIZE_PROMPT +from graphrag.prompts.query.basic_search_system_prompt import BASIC_SEARCH_SYSTEM_PROMPT from graphrag.prompts.query.drift_search_system_prompt import DRIFT_LOCAL_SYSTEM_PROMPT from graphrag.prompts.query.global_search_knowledge_system_prompt import ( GENERAL_KNOWLEDGE_INSTRUCTION, @@ -60,6 +61,7 @@ def initialize_project_at(path: Path) -> None: "global_search_reduce_system_prompt": REDUCE_SYSTEM_PROMPT, "global_search_knowledge_system_prompt": GENERAL_KNOWLEDGE_INSTRUCTION, "local_search_system_prompt": LOCAL_SEARCH_SYSTEM_PROMPT, + "basic_search_system_prompt": BASIC_SEARCH_SYSTEM_PROMPT, "question_gen_system_prompt": QUESTION_SYSTEM_PROMPT, } diff --git a/graphrag/cli/main.py b/graphrag/cli/main.py index b023220875..7a6da082ac 100644 --- a/graphrag/cli/main.py +++ b/graphrag/cli/main.py @@ -88,6 +88,7 @@ class SearchType(Enum): LOCAL = "local" GLOBAL = "global" DRIFT = "drift" + BASIC = "basic" def __str__(self): """Return the string representation of the enum value.""" @@ -170,7 +171,6 @@ def _index_cli( resolve_path=True, ), ] = None, - new: Annotated[bool, typer.Option(help="Use the new 
pipeline.")] = False, ): """Build a knowledge graph index.""" from graphrag.cli.index import index_cli @@ -186,7 +186,6 @@ def _index_cli( dry_run=dry_run, skip_validation=skip_validation, output_dir=output, - new_pipeline=new, ) @@ -426,7 +425,12 @@ def _query_cli( ] = False, ): """Query a knowledge graph index.""" - from graphrag.cli.query import run_drift_search, run_global_search, run_local_search + from graphrag.cli.query import ( + run_basic_search, + run_drift_search, + run_global_search, + run_local_search, + ) match method: case SearchType.LOCAL: @@ -459,5 +463,13 @@ def _query_cli( streaming=False, # Drift search does not support streaming (yet) query=query, ) + case SearchType.BASIC: + run_basic_search( + config_filepath=config, + data_dir=data, + root_dir=root, + streaming=streaming, + query=query, + ) case _: raise ValueError(INVALID_METHOD_ERROR) diff --git a/graphrag/cli/query.py b/graphrag/cli/query.py index 6193db3a95..1d97b4fcfb 100644 --- a/graphrag/cli/query.py +++ b/graphrag/cli/query.py @@ -16,7 +16,7 @@ from graphrag.index.create_pipeline_config import create_pipeline_config from graphrag.logger.print_progress import PrintProgressLogger from graphrag.storage.factory import StorageFactory -from graphrag.utils.storage import load_table_from_storage +from graphrag.utils.storage import load_table_from_storage, storage_has_table logger = PrintProgressLogger("") @@ -43,10 +43,10 @@ def run_global_search( dataframe_dict = _resolve_output_files( config=config, output_list=[ - "create_final_nodes.parquet", - "create_final_entities.parquet", - "create_final_communities.parquet", - "create_final_community_reports.parquet", + "create_final_nodes", + "create_final_entities", + "create_final_communities", + "create_final_community_reports", ], optional_list=[], ) @@ -127,14 +127,14 @@ def run_local_search( dataframe_dict = _resolve_output_files( config=config, output_list=[ - "create_final_nodes.parquet", - "create_final_community_reports.parquet", - 
"create_final_text_units.parquet", - "create_final_relationships.parquet", - "create_final_entities.parquet", + "create_final_nodes", + "create_final_community_reports", + "create_final_text_units", + "create_final_relationships", + "create_final_entities", ], optional_list=[ - "create_final_covariates.parquet", + "create_final_covariates", ], ) final_nodes: pd.DataFrame = dataframe_dict["create_final_nodes"] @@ -217,11 +217,11 @@ def run_drift_search( dataframe_dict = _resolve_output_files( config=config, output_list=[ - "create_final_nodes.parquet", - "create_final_community_reports.parquet", - "create_final_text_units.parquet", - "create_final_relationships.parquet", - "create_final_entities.parquet", + "create_final_nodes", + "create_final_community_reports", + "create_final_text_units", + "create_final_relationships", + "create_final_entities", ], ) final_nodes: pd.DataFrame = dataframe_dict["create_final_nodes"] @@ -257,6 +257,69 @@ def run_drift_search( return response, context_data +def run_basic_search( + config_filepath: Path | None, + data_dir: Path | None, + root_dir: Path, + streaming: bool, + query: str, +): + """Perform a basics search with a given query. + + Loads index files required for basic search and calls the Query API. 
+ """ + root = root_dir.resolve() + config = load_config(root, config_filepath) + config.storage.base_dir = str(data_dir) if data_dir else config.storage.base_dir + resolve_paths(config) + + dataframe_dict = _resolve_output_files( + config=config, + output_list=[ + "create_final_text_units.parquet", + ], + ) + final_text_units: pd.DataFrame = dataframe_dict["create_final_text_units"] + + print(streaming) # noqa: T201 + + # # call the Query API + if streaming: + + async def run_streaming_search(): + full_response = "" + context_data = None + get_context_data = True + async for stream_chunk in api.basic_search_streaming( + config=config, + text_units=final_text_units, + query=query, + ): + if get_context_data: + context_data = stream_chunk + get_context_data = False + else: + full_response += stream_chunk + print(stream_chunk, end="") # noqa: T201 + sys.stdout.flush() # flush output buffer to display text immediately + print() # noqa: T201 + return full_response, context_data + + return asyncio.run(run_streaming_search()) + # not streaming + response, context_data = asyncio.run( + api.basic_search( + config=config, + text_units=final_text_units, + query=query, + ) + ) + logger.success(f"Basic Search Response:\n{response}") + # NOTE: we return the response and context data here purely as a complete demonstration of the API. + # External users should use the API directly to get the response and context data. 
+ return response, context_data + + def _resolve_output_files( config: GraphRagConfig, output_list: list[str], @@ -269,24 +332,20 @@ def _resolve_output_files( storage_obj = StorageFactory().create_storage( storage_type=storage_config["type"], kwargs=storage_config ) - for output_file in output_list: - df_key = output_file.split(".")[0] - df_value = asyncio.run( - load_table_from_storage(name=output_file, storage=storage_obj) - ) - dataframe_dict[df_key] = df_value + for name in output_list: + df_value = asyncio.run(load_table_from_storage(name=name, storage=storage_obj)) + dataframe_dict[name] = df_value # for optional output files, set the dict entry to None instead of erroring out if it does not exist if optional_list: for optional_file in optional_list: - file_exists = asyncio.run(storage_obj.has(optional_file)) - df_key = optional_file.split(".")[0] + file_exists = asyncio.run(storage_has_table(optional_file, storage_obj)) if file_exists: df_value = asyncio.run( load_table_from_storage(name=optional_file, storage=storage_obj) ) - dataframe_dict[df_key] = df_value + dataframe_dict[optional_file] = df_value else: - dataframe_dict[df_key] = None + dataframe_dict[optional_file] = None return dataframe_dict diff --git a/graphrag/config/create_graphrag_config.py b/graphrag/config/create_graphrag_config.py index 6358fcc788..433da098d6 100644 --- a/graphrag/config/create_graphrag_config.py +++ b/graphrag/config/create_graphrag_config.py @@ -6,14 +6,13 @@ import os from enum import Enum from pathlib import Path -from typing import cast +from typing import Any, cast -from datashaper import AsyncType from environs import Env -from pydantic import TypeAdapter import graphrag.config.defaults as defs from graphrag.config.enums import ( + AsyncType, CacheType, InputFileType, InputType, @@ -28,8 +27,7 @@ AzureApiBaseMissingError, AzureDeploymentNameMissingError, ) -from graphrag.config.input_models.graphrag_config_input import GraphRagConfigInput -from 
graphrag.config.input_models.llm_config_input import LLMConfigInput +from graphrag.config.models.basic_search_config import BasicSearchConfig from graphrag.config.models.cache_config import CacheConfig from graphrag.config.models.chunking_config import ChunkingConfig, ChunkStrategyType from graphrag.config.models.claim_extraction_config import ClaimExtractionConfig @@ -54,27 +52,24 @@ from graphrag.config.models.umap_config import UmapConfig from graphrag.config.read_dotenv import read_dotenv -InputModelValidator = TypeAdapter(GraphRagConfigInput) - def create_graphrag_config( - values: GraphRagConfigInput | None = None, root_dir: str | None = None + values: dict[str, Any] | None = None, root_dir: str | None = None ) -> GraphRagConfig: """Load Configuration Parameters from a dictionary.""" values = values or {} root_dir = root_dir or str(Path.cwd()) env = _make_env(root_dir) _token_replace(cast("dict", values)) - InputModelValidator.validate_python(values, strict=True) reader = EnvironmentReader(env) - def hydrate_async_type(input: LLMConfigInput, base: AsyncType) -> AsyncType: + def hydrate_async_type(input: dict[str, Any], base: AsyncType) -> AsyncType: value = input.get(Fragment.async_mode) return AsyncType(value) if value else base def hydrate_llm_params( - config: LLMConfigInput, base: LLMParameters + config: dict[str, Any], base: LLMParameters ) -> LLMParameters: with reader.use(config.get("llm")): llm_type = reader.str(Fragment.type) @@ -131,7 +126,7 @@ def hydrate_llm_params( ) def hydrate_embeddings_params( - config: LLMConfigInput, base: LLMParameters + config: dict[str, Any], base: LLMParameters ) -> LLMParameters: with reader.use(config.get("llm")): api_type = reader.str(Fragment.type) or defs.EMBEDDING_TYPE @@ -197,7 +192,7 @@ def hydrate_embeddings_params( ) def hydrate_parallelization_params( - config: LLMConfigInput, base: ParallelizationParameters + config: dict[str, Any], base: ParallelizationParameters ) -> ParallelizationParameters: with 
reader.use(config.get("parallelization")): return ParallelizationParameters( @@ -636,6 +631,28 @@ def hydrate_parallelization_params( or defs.DRIFT_LOCAL_SEARCH_LLM_MAX_TOKENS, ) + with ( + reader.use(values.get("basic_search")), + reader.envvar_prefix(Section.basic_search), + ): + basic_search_model = BasicSearchConfig( + prompt=reader.str("prompt") or None, + text_unit_prop=reader.float("text_unit_prop") + or defs.BASIC_SEARCH_TEXT_UNIT_PROP, + conversation_history_max_turns=reader.int( + "conversation_history_max_turns" + ) + or defs.BASIC_SEARCH_CONVERSATION_HISTORY_MAX_TURNS, + temperature=reader.float("llm_temperature") + or defs.BASIC_SEARCH_LLM_TEMPERATURE, + top_p=reader.float("llm_top_p") or defs.BASIC_SEARCH_LLM_TOP_P, + n=reader.int("llm_n") or defs.BASIC_SEARCH_LLM_N, + max_tokens=reader.int(Fragment.max_tokens) + or defs.BASIC_SEARCH_MAX_TOKENS, + llm_max_tokens=reader.int("llm_max_tokens") + or defs.BASIC_SEARCH_LLM_MAX_TOKENS, + ) + skip_workflows = reader.list("skip_workflows") or [] return GraphRagConfig( @@ -663,6 +680,7 @@ def hydrate_parallelization_params( local_search=local_search_model, global_search=global_search_model, drift_search=drift_search_model, + basic_search=basic_search_model, ) @@ -731,6 +749,7 @@ class Section(str, Enum): local_search = "LOCAL_SEARCH" global_search = "GLOBAL_SEARCH" drift_search = "DRIFT_SEARCH" + basic_search = "BASIC_SEARCH" def _is_azure(llm_type: LLMType | None) -> bool: diff --git a/graphrag/config/defaults.py b/graphrag/config/defaults.py index 9da336cca9..73f27dbe33 100644 --- a/graphrag/config/defaults.py +++ b/graphrag/config/defaults.py @@ -5,9 +5,8 @@ from pathlib import Path -from datashaper import AsyncType - from graphrag.config.enums import ( + AsyncType, CacheType, InputFileType, InputType, @@ -161,3 +160,12 @@ DRIFT_LOCAL_SEARCH_LLM_MAX_TOKENS = 2000 DRIFT_N_DEPTH = 3 + +# Basic Search +BASIC_SEARCH_TEXT_UNIT_PROP = 0.5 +BASIC_SEARCH_CONVERSATION_HISTORY_MAX_TURNS = 5 +BASIC_SEARCH_MAX_TOKENS = 
12_000 +BASIC_SEARCH_LLM_TEMPERATURE = 0 +BASIC_SEARCH_LLM_TOP_P = 1 +BASIC_SEARCH_LLM_N = 1 +BASIC_SEARCH_LLM_MAX_TOKENS = 2000 diff --git a/graphrag/config/enums.py b/graphrag/config/enums.py index 4ff1e35571..b13da14874 100644 --- a/graphrag/config/enums.py +++ b/graphrag/config/enums.py @@ -114,3 +114,10 @@ class LLMType(str, Enum): def __repr__(self): """Get a string representation.""" return f'"{self.value}"' + + +class AsyncType(str, Enum): + """Enum for the type of async to use.""" + + AsyncIO = "asyncio" + Threaded = "threaded" diff --git a/graphrag/config/init_content.py b/graphrag/config/init_content.py index 7506eb3a7b..3e210459c8 100644 --- a/graphrag/config/init_content.py +++ b/graphrag/config/init_content.py @@ -132,6 +132,9 @@ drift_search: prompt: "prompts/drift_search_system_prompt.txt" + +basic_search: + prompt: "prompts/basic_search_system_prompt.txt" """ INIT_DOTENV = """\ diff --git a/graphrag/config/input_models/__init__.py b/graphrag/config/input_models/__init__.py deleted file mode 100644 index 6c5862a947..0000000000 --- a/graphrag/config/input_models/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""Interfaces for Default Config parameterization.""" diff --git a/graphrag/config/input_models/cache_config_input.py b/graphrag/config/input_models/cache_config_input.py deleted file mode 100644 index 4c1ba35c57..0000000000 --- a/graphrag/config/input_models/cache_config_input.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
-# Licensed under the MIT License - -"""Parameterization settings for the default configuration.""" - -from typing_extensions import NotRequired, TypedDict - -from graphrag.config.enums import CacheType - - -class CacheConfigInput(TypedDict): - """The default configuration section for Cache.""" - - type: NotRequired[CacheType | str | None] - base_dir: NotRequired[str | None] - connection_string: NotRequired[str | None] - container_name: NotRequired[str | None] - storage_account_blob_url: NotRequired[str | None] - cosmosdb_account_url: NotRequired[str | None] diff --git a/graphrag/config/input_models/chunking_config_input.py b/graphrag/config/input_models/chunking_config_input.py deleted file mode 100644 index bbf4fc735f..0000000000 --- a/graphrag/config/input_models/chunking_config_input.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""Parameterization settings for the default configuration.""" - -from typing_extensions import NotRequired, TypedDict - - -class ChunkingConfigInput(TypedDict): - """Configuration section for chunking.""" - - size: NotRequired[int | str | None] - overlap: NotRequired[int | str | None] - group_by_columns: NotRequired[list[str] | str | None] - strategy: NotRequired[dict | None] diff --git a/graphrag/config/input_models/claim_extraction_config_input.py b/graphrag/config/input_models/claim_extraction_config_input.py deleted file mode 100644 index 42ff60ea14..0000000000 --- a/graphrag/config/input_models/claim_extraction_config_input.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
-# Licensed under the MIT License - -"""Parameterization settings for the default configuration.""" - -from typing_extensions import NotRequired - -from graphrag.config.input_models.llm_config_input import LLMConfigInput - - -class ClaimExtractionConfigInput(LLMConfigInput): - """Configuration section for claim extraction.""" - - enabled: NotRequired[bool | None] - prompt: NotRequired[str | None] - description: NotRequired[str | None] - max_gleanings: NotRequired[int | str | None] - strategy: NotRequired[dict | None] - encoding_model: NotRequired[str | None] diff --git a/graphrag/config/input_models/cluster_graph_config_input.py b/graphrag/config/input_models/cluster_graph_config_input.py deleted file mode 100644 index eb6f9cd1c6..0000000000 --- a/graphrag/config/input_models/cluster_graph_config_input.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""Parameterization settings for the default configuration.""" - -from typing_extensions import NotRequired, TypedDict - - -class ClusterGraphConfigInput(TypedDict): - """Configuration section for clustering graphs.""" - - max_cluster_size: NotRequired[int | None] - strategy: NotRequired[dict | None] diff --git a/graphrag/config/input_models/community_reports_config_input.py b/graphrag/config/input_models/community_reports_config_input.py deleted file mode 100644 index 4f8297ae33..0000000000 --- a/graphrag/config/input_models/community_reports_config_input.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
-# Licensed under the MIT License - -"""Parameterization settings for the default configuration.""" - -from typing_extensions import NotRequired - -from graphrag.config.input_models.llm_config_input import LLMConfigInput - - -class CommunityReportsConfigInput(LLMConfigInput): - """Configuration section for community reports.""" - - prompt: NotRequired[str | None] - max_length: NotRequired[int | str | None] - max_input_length: NotRequired[int | str | None] - strategy: NotRequired[dict | None] diff --git a/graphrag/config/input_models/embed_graph_config_input.py b/graphrag/config/input_models/embed_graph_config_input.py deleted file mode 100644 index f8b6ee6faf..0000000000 --- a/graphrag/config/input_models/embed_graph_config_input.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""Parameterization settings for the default configuration.""" - -from typing_extensions import NotRequired, TypedDict - - -class EmbedGraphConfigInput(TypedDict): - """The default configuration section for Node2Vec.""" - - enabled: NotRequired[bool | str | None] - num_walks: NotRequired[int | str | None] - walk_length: NotRequired[int | str | None] - window_size: NotRequired[int | str | None] - iterations: NotRequired[int | str | None] - random_seed: NotRequired[int | str | None] - strategy: NotRequired[dict | None] diff --git a/graphrag/config/input_models/entity_extraction_config_input.py b/graphrag/config/input_models/entity_extraction_config_input.py deleted file mode 100644 index dcc2770c21..0000000000 --- a/graphrag/config/input_models/entity_extraction_config_input.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
-# Licensed under the MIT License - -"""Parameterization settings for the default configuration.""" - -from typing_extensions import NotRequired - -from graphrag.config.input_models.llm_config_input import LLMConfigInput - - -class EntityExtractionConfigInput(LLMConfigInput): - """Configuration section for entity extraction.""" - - prompt: NotRequired[str | None] - entity_types: NotRequired[list[str] | str | None] - max_gleanings: NotRequired[int | str | None] - strategy: NotRequired[dict | None] - encoding_model: NotRequired[str | None] diff --git a/graphrag/config/input_models/global_search_config_input.py b/graphrag/config/input_models/global_search_config_input.py deleted file mode 100644 index e13fbbfa9e..0000000000 --- a/graphrag/config/input_models/global_search_config_input.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""Parameterization settings for the default configuration.""" - -from typing_extensions import NotRequired, TypedDict - - -class GlobalSearchConfigInput(TypedDict): - """The default configuration section for Cache.""" - - max_tokens: NotRequired[int | str | None] - data_max_tokens: NotRequired[int | str | None] - map_max_tokens: NotRequired[int | str | None] - reduce_max_tokens: NotRequired[int | str | None] - concurrency: NotRequired[int | str | None] diff --git a/graphrag/config/input_models/graphrag_config_input.py b/graphrag/config/input_models/graphrag_config_input.py deleted file mode 100644 index 9d3094edd7..0000000000 --- a/graphrag/config/input_models/graphrag_config_input.py +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
-# Licensed under the MIT License - -"""Parameterization settings for the default configuration.""" - -from typing_extensions import NotRequired - -from graphrag.config.input_models.cache_config_input import CacheConfigInput -from graphrag.config.input_models.chunking_config_input import ChunkingConfigInput -from graphrag.config.input_models.claim_extraction_config_input import ( - ClaimExtractionConfigInput, -) -from graphrag.config.input_models.cluster_graph_config_input import ( - ClusterGraphConfigInput, -) -from graphrag.config.input_models.community_reports_config_input import ( - CommunityReportsConfigInput, -) -from graphrag.config.input_models.embed_graph_config_input import EmbedGraphConfigInput -from graphrag.config.input_models.entity_extraction_config_input import ( - EntityExtractionConfigInput, -) -from graphrag.config.input_models.global_search_config_input import ( - GlobalSearchConfigInput, -) -from graphrag.config.input_models.input_config_input import InputConfigInput -from graphrag.config.input_models.llm_config_input import LLMConfigInput -from graphrag.config.input_models.local_search_config_input import ( - LocalSearchConfigInput, -) -from graphrag.config.input_models.reporting_config_input import ReportingConfigInput -from graphrag.config.input_models.snapshots_config_input import SnapshotsConfigInput -from graphrag.config.input_models.storage_config_input import StorageConfigInput -from graphrag.config.input_models.summarize_descriptions_config_input import ( - SummarizeDescriptionsConfigInput, -) -from graphrag.config.input_models.text_embedding_config_input import ( - TextEmbeddingConfigInput, -) -from graphrag.config.input_models.umap_config_input import UmapConfigInput - - -class GraphRagConfigInput(LLMConfigInput): - """Base class for the Default-Configuration parameterization settings.""" - - reporting: NotRequired[ReportingConfigInput | None] - storage: NotRequired[StorageConfigInput | None] - cache: NotRequired[CacheConfigInput | 
None] - input: NotRequired[InputConfigInput | None] - embed_graph: NotRequired[EmbedGraphConfigInput | None] - embeddings: NotRequired[TextEmbeddingConfigInput | None] - chunks: NotRequired[ChunkingConfigInput | None] - snapshots: NotRequired[SnapshotsConfigInput | None] - entity_extraction: NotRequired[EntityExtractionConfigInput | None] - summarize_descriptions: NotRequired[SummarizeDescriptionsConfigInput | None] - community_reports: NotRequired[CommunityReportsConfigInput | None] - claim_extraction: NotRequired[ClaimExtractionConfigInput | None] - cluster_graph: NotRequired[ClusterGraphConfigInput | None] - umap: NotRequired[UmapConfigInput | None] - encoding_model: NotRequired[str | None] - skip_workflows: NotRequired[list[str] | str | None] - local_search: NotRequired[LocalSearchConfigInput | None] - global_search: NotRequired[GlobalSearchConfigInput | None] diff --git a/graphrag/config/input_models/input_config_input.py b/graphrag/config/input_models/input_config_input.py deleted file mode 100644 index 4ff89d2c9a..0000000000 --- a/graphrag/config/input_models/input_config_input.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
-# Licensed under the MIT License - -"""Parameterization settings for the default configuration.""" - -from typing_extensions import NotRequired, TypedDict - -from graphrag.config.enums import InputFileType, InputType - - -class InputConfigInput(TypedDict): - """The default configuration section for Input.""" - - type: NotRequired[InputType | str | None] - file_type: NotRequired[InputFileType | str | None] - base_dir: NotRequired[str | None] - connection_string: NotRequired[str | None] - container_name: NotRequired[str | None] - file_encoding: NotRequired[str | None] - file_pattern: NotRequired[str | None] - source_column: NotRequired[str | None] - timestamp_column: NotRequired[str | None] - timestamp_format: NotRequired[str | None] - text_column: NotRequired[str | None] - title_column: NotRequired[str | None] - document_attribute_columns: NotRequired[list[str] | str | None] - storage_account_blob_url: NotRequired[str | None] diff --git a/graphrag/config/input_models/llm_config_input.py b/graphrag/config/input_models/llm_config_input.py deleted file mode 100644 index 35b3b342b4..0000000000 --- a/graphrag/config/input_models/llm_config_input.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
-# Licensed under the MIT License - -"""Parameterization settings for the default configuration.""" - -from datashaper import AsyncType -from typing_extensions import NotRequired, TypedDict - -from graphrag.config.input_models.llm_parameters_input import LLMParametersInput -from graphrag.config.input_models.parallelization_parameters_input import ( - ParallelizationParametersInput, -) - - -class LLMConfigInput(TypedDict): - """Base class for LLM-configured steps.""" - - llm: NotRequired[LLMParametersInput | None] - parallelization: NotRequired[ParallelizationParametersInput | None] - async_mode: NotRequired[AsyncType | str | None] diff --git a/graphrag/config/input_models/llm_parameters_input.py b/graphrag/config/input_models/llm_parameters_input.py deleted file mode 100644 index b99d7e9b7b..0000000000 --- a/graphrag/config/input_models/llm_parameters_input.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""LLM Parameters model.""" - -from typing_extensions import NotRequired, TypedDict - -from graphrag.config.enums import LLMType - - -class LLMParametersInput(TypedDict): - """LLM Parameters model.""" - - api_key: NotRequired[str | None] - type: NotRequired[LLMType | str | None] - model: NotRequired[str | None] - max_tokens: NotRequired[int | str | None] - request_timeout: NotRequired[float | str | None] - api_base: NotRequired[str | None] - api_version: NotRequired[str | None] - organization: NotRequired[str | None] - proxy: NotRequired[str | None] - audience: NotRequired[str | None] - deployment_name: NotRequired[str | None] - model_supports_json: NotRequired[bool | str | None] - tokens_per_minute: NotRequired[int | str | None] - requests_per_minute: NotRequired[int | str | None] - max_retries: NotRequired[int | str | None] - max_retry_wait: NotRequired[float | str | None] - sleep_on_rate_limit_recommendation: NotRequired[bool | str | None] - concurrent_requests: NotRequired[int | str | None] 
diff --git a/graphrag/config/input_models/local_search_config_input.py b/graphrag/config/input_models/local_search_config_input.py deleted file mode 100644 index 23df40102a..0000000000 --- a/graphrag/config/input_models/local_search_config_input.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""Parameterization settings for the default configuration.""" - -from typing_extensions import NotRequired, TypedDict - - -class LocalSearchConfigInput(TypedDict): - """The default configuration section for Cache.""" - - text_unit_prop: NotRequired[float | str | None] - community_prop: NotRequired[float | str | None] - conversation_history_max_turns: NotRequired[int | str | None] - top_k_entities: NotRequired[int | str | None] - top_k_relationships: NotRequired[int | str | None] - max_tokens: NotRequired[int | str | None] - llm_max_tokens: NotRequired[int | str | None] diff --git a/graphrag/config/input_models/parallelization_parameters_input.py b/graphrag/config/input_models/parallelization_parameters_input.py deleted file mode 100644 index e9204437b2..0000000000 --- a/graphrag/config/input_models/parallelization_parameters_input.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""LLM Parameters model.""" - -from typing_extensions import NotRequired, TypedDict - - -class ParallelizationParametersInput(TypedDict): - """LLM Parameters model.""" - - stagger: NotRequired[float | str | None] - num_threads: NotRequired[int | str | None] diff --git a/graphrag/config/input_models/reporting_config_input.py b/graphrag/config/input_models/reporting_config_input.py deleted file mode 100644 index a224f0b440..0000000000 --- a/graphrag/config/input_models/reporting_config_input.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
-# Licensed under the MIT License - -"""Parameterization settings for the default configuration.""" - -from typing_extensions import NotRequired, TypedDict - -from graphrag.config.enums import ReportingType - - -class ReportingConfigInput(TypedDict): - """The default configuration section for Reporting.""" - - type: NotRequired[ReportingType | str | None] - base_dir: NotRequired[str | None] - connection_string: NotRequired[str | None] - container_name: NotRequired[str | None] - storage_account_blob_url: NotRequired[str | None] diff --git a/graphrag/config/input_models/snapshots_config_input.py b/graphrag/config/input_models/snapshots_config_input.py deleted file mode 100644 index 74f284f077..0000000000 --- a/graphrag/config/input_models/snapshots_config_input.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""Parameterization settings for the default configuration.""" - -from typing_extensions import NotRequired, TypedDict - - -class SnapshotsConfigInput(TypedDict): - """Configuration section for snapshots.""" - - embeddings: NotRequired[bool | str | None] - graphml: NotRequired[bool | str | None] - transient: NotRequired[bool | str | None] diff --git a/graphrag/config/input_models/storage_config_input.py b/graphrag/config/input_models/storage_config_input.py deleted file mode 100644 index 4b0e85a2e5..0000000000 --- a/graphrag/config/input_models/storage_config_input.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
-# Licensed under the MIT License - -"""Parameterization settings for the default configuration.""" - -from typing_extensions import NotRequired, TypedDict - -from graphrag.config.enums import StorageType - - -class StorageConfigInput(TypedDict): - """The default configuration section for Storage.""" - - type: NotRequired[StorageType | str | None] - base_dir: NotRequired[str | None] - connection_string: NotRequired[str | None] - container_name: NotRequired[str | None] - storage_account_blob_url: NotRequired[str | None] - cosmosdb_account_url: NotRequired[str | None] diff --git a/graphrag/config/input_models/summarize_descriptions_config_input.py b/graphrag/config/input_models/summarize_descriptions_config_input.py deleted file mode 100644 index b71a465aef..0000000000 --- a/graphrag/config/input_models/summarize_descriptions_config_input.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""Parameterization settings for the default configuration.""" - -from typing_extensions import NotRequired - -from graphrag.config.input_models.llm_config_input import LLMConfigInput - - -class SummarizeDescriptionsConfigInput(LLMConfigInput): - """Configuration section for description summarization.""" - - prompt: NotRequired[str | None] - max_length: NotRequired[int | str | None] - strategy: NotRequired[dict | None] diff --git a/graphrag/config/input_models/text_embedding_config_input.py b/graphrag/config/input_models/text_embedding_config_input.py deleted file mode 100644 index de72612e34..0000000000 --- a/graphrag/config/input_models/text_embedding_config_input.py +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
-# Licensed under the MIT License - -"""Parameterization settings for the default configuration.""" - -from typing_extensions import NotRequired - -from graphrag.config.enums import ( - TextEmbeddingTarget, -) -from graphrag.config.input_models.llm_config_input import LLMConfigInput - - -class TextEmbeddingConfigInput(LLMConfigInput): - """Configuration section for text embeddings.""" - - batch_size: NotRequired[int | str | None] - batch_max_tokens: NotRequired[int | str | None] - target: NotRequired[TextEmbeddingTarget | str | None] - skip: NotRequired[list[str] | str | None] - vector_store: NotRequired[dict | None] - strategy: NotRequired[dict | None] diff --git a/graphrag/config/input_models/umap_config_input.py b/graphrag/config/input_models/umap_config_input.py deleted file mode 100644 index 543ca385e0..0000000000 --- a/graphrag/config/input_models/umap_config_input.py +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""Parameterization settings for the default configuration.""" - -from typing_extensions import NotRequired, TypedDict - - -class UmapConfigInput(TypedDict): - """Configuration section for UMAP.""" - - enabled: NotRequired[bool | str | None] diff --git a/graphrag/config/models/basic_search_config.py b/graphrag/config/models/basic_search_config.py new file mode 100644 index 0000000000..d31f453aae --- /dev/null +++ b/graphrag/config/models/basic_search_config.py @@ -0,0 +1,42 @@ +# Copyright (c) 2024 Microsoft Corporation. 
+# Licensed under the MIT License + +"""Parameterization settings for the default configuration.""" + +from pydantic import BaseModel, Field + +import graphrag.config.defaults as defs + + +class BasicSearchConfig(BaseModel): + """The default configuration section for basic search.""" + + prompt: str | None = Field( + description="The basic search prompt to use.", default=None + ) + text_unit_prop: float = Field( + description="The text unit proportion.", + default=defs.BASIC_SEARCH_TEXT_UNIT_PROP, + ) + conversation_history_max_turns: int = Field( + description="The conversation history maximum turns.", + default=defs.BASIC_SEARCH_CONVERSATION_HISTORY_MAX_TURNS, + ) + temperature: float | None = Field( + description="The temperature to use for token generation.", + default=defs.BASIC_SEARCH_LLM_TEMPERATURE, + ) + top_p: float | None = Field( + description="The top-p value to use for token generation.", + default=defs.BASIC_SEARCH_LLM_TOP_P, + ) + n: int | None = Field( + description="The number of completions to generate.", + default=defs.BASIC_SEARCH_LLM_N, + ) + max_tokens: int = Field( + description="The maximum tokens.", default=defs.BASIC_SEARCH_MAX_TOKENS + ) + llm_max_tokens: int = Field( + description="The LLM maximum tokens.", default=defs.BASIC_SEARCH_LLM_MAX_TOKENS + ) diff --git a/graphrag/config/models/graph_rag_config.py b/graphrag/config/models/graph_rag_config.py index 9e56eaed47..77ad944053 100644 --- a/graphrag/config/models/graph_rag_config.py +++ b/graphrag/config/models/graph_rag_config.py @@ -7,6 +7,7 @@ from pydantic import Field import graphrag.config.defaults as defs +from graphrag.config.models.basic_search_config import BasicSearchConfig from graphrag.config.models.cache_config import CacheConfig from graphrag.config.models.chunking_config import ChunkingConfig from graphrag.config.models.claim_extraction_config import ClaimExtractionConfig @@ -146,6 +147,11 @@ def __str__(self): ) """The drift search configuration.""" + basic_search:
BasicSearchConfig = Field( + description="The basic search configuration.", default=BasicSearchConfig() + ) + """The basic search configuration.""" + encoding_model: str = Field( description="The encoding model to use.", default=defs.ENCODING_MODEL ) diff --git a/graphrag/config/models/llm_config.py b/graphrag/config/models/llm_config.py index 3759bd949e..78459e8f1a 100644 --- a/graphrag/config/models/llm_config.py +++ b/graphrag/config/models/llm_config.py @@ -3,10 +3,10 @@ """Parameterization settings for the default configuration.""" -from datashaper import AsyncType from pydantic import BaseModel, Field import graphrag.config.defaults as defs +from graphrag.config.enums import AsyncType from graphrag.config.models.llm_parameters import LLMParameters from graphrag.config.models.parallelization_parameters import ParallelizationParameters diff --git a/graphrag/index/config/input.py b/graphrag/index/config/input.py index 5d4b08dfa8..f9dad568d5 100644 --- a/graphrag/index/config/input.py +++ b/graphrag/index/config/input.py @@ -10,7 +10,6 @@ from pydantic import BaseModel, Field from graphrag.config.enums import InputFileType, InputType -from graphrag.index.config.workflow import PipelineWorkflowStep T = TypeVar("T") @@ -56,11 +55,6 @@ class PipelineInputConfig(BaseModel, Generic[T]): ) """The optional file filter for the input files.""" - post_process: list[PipelineWorkflowStep] | None = Field( - description="The post processing steps for the input.", default=None - ) - """The post processing steps for the input.""" - encoding: str | None = Field( description="The encoding for the input files.", default=None ) diff --git a/graphrag/index/config/workflow.py b/graphrag/index/config/workflow.py index 30e77d504f..58f1e5fddf 100644 --- a/graphrag/index/config/workflow.py +++ b/graphrag/index/config/workflow.py @@ -9,9 +9,6 @@ from pydantic import BaseModel, Field -PipelineWorkflowStep = dict[str, Any] -"""Represent a step in a workflow.""" - PipelineWorkflowConfig = 
dict[str, Any] """Represent a configuration for a workflow.""" @@ -22,11 +19,6 @@ class PipelineWorkflowReference(BaseModel): name: str | None = Field(description="Name of the workflow.", default=None) """Name of the workflow.""" - steps: list[PipelineWorkflowStep] | None = Field( - description="The optional steps for the workflow.", default=None - ) - """The optional steps for the workflow.""" - config: PipelineWorkflowConfig | None = Field( description="The optional configuration for the workflow.", default=None ) diff --git a/graphrag/index/context.py b/graphrag/index/context.py index c45decd173..c9242783c9 100644 --- a/graphrag/index/context.py +++ b/graphrag/index/context.py @@ -37,10 +37,3 @@ class PipelineRunContext: "Long-term storage for pipeline verbs to use. Items written here will be written to the storage provider." cache: PipelineCache "Cache instance for reading previous LLM responses." - runtime_storage: PipelineStorage - "Runtime only storage for pipeline verbs to use. Items written here will only live in memory during the current run." - - -# TODO: For now, just has the same props available to it -VerbRunContext = PipelineRunContext -"""Provides the context for the current verb run.""" diff --git a/graphrag/index/create_pipeline_config.py b/graphrag/index/create_pipeline_config.py index 800174d066..75a213239e 100644 --- a/graphrag/index/create_pipeline_config.py +++ b/graphrag/index/create_pipeline_config.py @@ -48,7 +48,7 @@ from graphrag.index.config.workflow import ( PipelineWorkflowReference, ) -from graphrag.index.workflows.default_workflows import ( +from graphrag.index.workflows import ( compute_communities, create_base_text_units, create_final_communities, diff --git a/graphrag/index/exporter.py b/graphrag/index/exporter.py deleted file mode 100644 index 4910e87467..0000000000 --- a/graphrag/index/exporter.py +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
-# Licensed under the MIT License - -"""ParquetExporter module.""" - -import logging -import traceback - -import pandas as pd -from pyarrow.lib import ArrowInvalid, ArrowTypeError - -from graphrag.index.typing import ErrorHandlerFn -from graphrag.storage.pipeline_storage import PipelineStorage - -log = logging.getLogger(__name__) - - -class ParquetExporter: - """ParquetExporter class. - - A class that exports dataframe's to a storage destination in .parquet file format. - """ - - _storage: PipelineStorage - _on_error: ErrorHandlerFn - - def __init__( - self, - storage: PipelineStorage, - on_error: ErrorHandlerFn, - ): - """Create a new Parquet Table TableExporter.""" - self._storage = storage - self._on_error = on_error - - async def export(self, name: str, data: pd.DataFrame) -> None: - """Export dataframe to storage.""" - filename = f"{name}.parquet" - log.info("exporting parquet table %s", filename) - try: - await self._storage.set(filename, data.to_parquet()) - except ArrowTypeError as e: - log.exception("Error while exporting parquet table") - self._on_error( - e, - traceback.format_exc(), - None, - ) - except ArrowInvalid as e: - log.exception("Error while exporting parquet table") - self._on_error( - e, - traceback.format_exc(), - None, - ) diff --git a/graphrag/index/flows/__init__.py b/graphrag/index/flows/__init__.py index b09c865054..13b7827bb7 100644 --- a/graphrag/index/flows/__init__.py +++ b/graphrag/index/flows/__init__.py @@ -1,4 +1,4 @@ # Copyright (c) 2024 Microsoft Corporation. 
# Licensed under the MIT License -"""Core workflows without DataShaper wrappings.""" +"""Core workflow functions without workflow/pipeline wrappings.""" diff --git a/graphrag/index/flows/create_base_text_units.py b/graphrag/index/flows/create_base_text_units.py index 63f8f62b6e..33dad0aebd 100644 --- a/graphrag/index/flows/create_base_text_units.py +++ b/graphrag/index/flows/create_base_text_units.py @@ -3,20 +3,15 @@ """All the steps to transform base text_units.""" -from dataclasses import dataclass -from typing import Any, cast +from typing import cast import pandas as pd -from datashaper import ( - FieldAggregateOperation, - Progress, - VerbCallbacks, - aggregate_operation_mapping, -) +from graphrag.callbacks.verb_callbacks import VerbCallbacks from graphrag.config.models.chunking_config import ChunkStrategyType from graphrag.index.operations.chunk_text.chunk_text import chunk_text from graphrag.index.utils.hashing import gen_sha512_hash +from graphrag.logger.progress import Progress def create_base_text_units( @@ -37,20 +32,16 @@ def create_base_text_units( callbacks.progress(Progress(percent=0)) - aggregated = _aggregate_df( - sort, - groupby=[*group_by_columns] if len(group_by_columns) > 0 else None, - aggregations=[ - { - "column": "text_with_ids", - "operation": "array_agg", - "to": "texts", - } - ], + aggregated = ( + ( + sort.groupby(group_by_columns, sort=False) + if len(group_by_columns) > 0 + else sort.groupby(lambda _x: True) + ) + .agg(texts=("text_with_ids", list)) + .reset_index() ) - callbacks.progress(Progress(percent=1)) - aggregated["chunks"] = chunk_text( aggregated, column="texts", @@ -81,57 +72,3 @@ def create_base_text_units( return cast( "pd.DataFrame", aggregated[aggregated["text"].notna()].reset_index(drop=True) ) - - -# TODO: would be nice to inline this completely in the main method with pandas -def _aggregate_df( - input: pd.DataFrame, - aggregations: list[dict[str, Any]], - groupby: list[str] | None = None, -) -> pd.DataFrame: -
"""Aggregate method definition.""" - aggregations_to_apply = _load_aggregations(aggregations) - df_aggregations = { - agg.column: _get_pandas_agg_operation(agg) - for agg in aggregations_to_apply.values() - } - if groupby is None: - output_grouped = input.groupby(lambda _x: True) - else: - output_grouped = input.groupby(groupby, sort=False) - output = cast("pd.DataFrame", output_grouped.agg(df_aggregations)) - output.rename( - columns={agg.column: agg.to for agg in aggregations_to_apply.values()}, - inplace=True, - ) - output.columns = [agg.to for agg in aggregations_to_apply.values()] - return output.reset_index() - - -@dataclass -class Aggregation: - """Aggregation class method definition.""" - - column: str | None - operation: str - to: str - - # Only useful for the concat operation - separator: str | None = None - - -def _get_pandas_agg_operation(agg: Aggregation) -> Any: - if agg.operation == "string_concat": - return (agg.separator or ",").join - return aggregate_operation_mapping[FieldAggregateOperation(agg.operation)] - - -def _load_aggregations( - aggregations: list[dict[str, Any]], -) -> dict[str, Aggregation]: - return { - aggregation["column"]: Aggregation( - aggregation["column"], aggregation["operation"], aggregation["to"] - ) - for aggregation in aggregations - } diff --git a/graphrag/index/flows/create_final_community_reports.py b/graphrag/index/flows/create_final_community_reports.py index 574945de9d..f94103db04 100644 --- a/graphrag/index/flows/create_final_community_reports.py +++ b/graphrag/index/flows/create_final_community_reports.py @@ -6,12 +6,10 @@ from uuid import uuid4 import pandas as pd -from datashaper import ( - AsyncType, - VerbCallbacks, -) from graphrag.cache.pipeline_cache import PipelineCache +from graphrag.callbacks.verb_callbacks import VerbCallbacks +from graphrag.config.enums import AsyncType from graphrag.index.operations.summarize_communities import ( prepare_community_reports, restore_community_hierarchy, diff --git 
a/graphrag/index/flows/create_final_covariates.py b/graphrag/index/flows/create_final_covariates.py index f9b5f7e377..ce6cccaa9c 100644 --- a/graphrag/index/flows/create_final_covariates.py +++ b/graphrag/index/flows/create_final_covariates.py @@ -7,12 +7,10 @@ from uuid import uuid4 import pandas as pd -from datashaper import ( - AsyncType, - VerbCallbacks, -) from graphrag.cache.pipeline_cache import PipelineCache +from graphrag.callbacks.verb_callbacks import VerbCallbacks +from graphrag.config.enums import AsyncType from graphrag.index.operations.extract_covariates.extract_covariates import ( extract_covariates, ) diff --git a/graphrag/index/flows/create_final_nodes.py b/graphrag/index/flows/create_final_nodes.py index 511ff429e7..f75ef2733a 100644 --- a/graphrag/index/flows/create_final_nodes.py +++ b/graphrag/index/flows/create_final_nodes.py @@ -4,10 +4,8 @@ """All the steps to transform final nodes.""" import pandas as pd -from datashaper import ( - VerbCallbacks, -) +from graphrag.callbacks.verb_callbacks import VerbCallbacks from graphrag.config.models.embed_graph_config import EmbedGraphConfig from graphrag.index.operations.compute_degree import compute_degree from graphrag.index.operations.create_graph import create_graph diff --git a/graphrag/index/flows/extract_graph.py b/graphrag/index/flows/extract_graph.py index 87e369f525..8eaa4d2951 100644 --- a/graphrag/index/flows/extract_graph.py +++ b/graphrag/index/flows/extract_graph.py @@ -7,12 +7,10 @@ from uuid import uuid4 import pandas as pd -from datashaper import ( - AsyncType, - VerbCallbacks, -) from graphrag.cache.pipeline_cache import PipelineCache +from graphrag.callbacks.verb_callbacks import VerbCallbacks +from graphrag.config.enums import AsyncType from graphrag.index.operations.extract_entities import extract_entities from graphrag.index.operations.summarize_descriptions import ( summarize_descriptions, diff --git a/graphrag/index/flows/generate_text_embeddings.py 
b/graphrag/index/flows/generate_text_embeddings.py index 877966dab7..d8c547663d 100644 --- a/graphrag/index/flows/generate_text_embeddings.py +++ b/graphrag/index/flows/generate_text_embeddings.py @@ -6,11 +6,9 @@ import logging import pandas as pd -from datashaper import ( - VerbCallbacks, -) from graphrag.cache.pipeline_cache import PipelineCache +from graphrag.callbacks.verb_callbacks import VerbCallbacks from graphrag.index.config.embeddings import ( community_full_content_embedding, community_summary_embedding, @@ -22,8 +20,8 @@ text_unit_text_embedding, ) from graphrag.index.operations.embed_text import embed_text -from graphrag.index.operations.snapshot import snapshot from graphrag.storage.pipeline_storage import PipelineStorage +from graphrag.utils.storage import write_table_to_storage log = logging.getLogger(__name__) @@ -131,9 +129,4 @@ async def _run_and_snapshot_embeddings( if snapshot_embeddings_enabled is True: data = data.loc[:, ["id", "embedding"]] - await snapshot( - data, - name=f"embeddings.{name}", - storage=storage, - formats=["parquet"], - ) + await write_table_to_storage(data, f"embeddings.{name}", storage) diff --git a/graphrag/index/llm/load_llm.py b/graphrag/index/llm/load_llm.py index ecd91b4bca..eae2cf34bd 100644 --- a/graphrag/index/llm/load_llm.py +++ b/graphrag/index/llm/load_llm.py @@ -29,9 +29,8 @@ from .mock_llm import MockChatLLM if TYPE_CHECKING: - from datashaper import VerbCallbacks - from graphrag.cache.pipeline_cache import PipelineCache + from graphrag.callbacks.verb_callbacks import VerbCallbacks from graphrag.index.typing import ErrorHandlerFn log = logging.getLogger(__name__) diff --git a/graphrag/index/load_pipeline_config.py b/graphrag/index/load_pipeline_config.py deleted file mode 100644 index 77893b9535..0000000000 --- a/graphrag/index/load_pipeline_config.py +++ /dev/null @@ -1,78 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
-# Licensed under the MIT License - -"""A module containing read_dotenv, load_pipeline_config, _parse_yaml and _create_include_constructor methods definition.""" - -import json -from pathlib import Path - -import yaml -from pyaml_env import parse_config as parse_config_with_env - -from graphrag.config.create_graphrag_config import create_graphrag_config, read_dotenv -from graphrag.index.config.pipeline import PipelineConfig -from graphrag.index.create_pipeline_config import create_pipeline_config - - -def load_pipeline_config(config_or_path: str | PipelineConfig) -> PipelineConfig: - """Load a pipeline config from a file path or a config object.""" - if isinstance(config_or_path, PipelineConfig): - config = config_or_path - elif config_or_path == "default": - config = create_pipeline_config(create_graphrag_config(root_dir=".")) - else: - # Is there a .env file in the same directory as the config? - read_dotenv(str(Path(config_or_path).parent)) - - if config_or_path.endswith(".json"): - with Path(config_or_path).open("rb") as f: - config = json.loads(f.read().decode(encoding="utf-8", errors="strict")) - elif config_or_path.endswith((".yml", ".yaml")): - config = _parse_yaml(config_or_path) - else: - msg = f"Invalid config file type: {config_or_path}" - raise ValueError(msg) - - config = PipelineConfig.model_validate(config) - if not config.root_dir: - config.root_dir = str(Path(config_or_path).parent.resolve()) - - if config.extends is not None: - if isinstance(config.extends, str): - config.extends = [config.extends] - for extended_config in config.extends: - extended_config = load_pipeline_config(extended_config) - merged_config = { - **json.loads(extended_config.model_dump_json()), - **json.loads(config.model_dump_json(exclude_unset=True)), - } - config = PipelineConfig.model_validate(merged_config) - - return config - - -def _parse_yaml(path: str): - """Parse a yaml file, with support for !include directives.""" - # I don't like that this is static - 
loader_class = yaml.SafeLoader - - # Add !include constructor if not already present. - if "!include" not in loader_class.yaml_constructors: - loader_class.add_constructor("!include", _create_include_constructor()) - - return parse_config_with_env(path, loader=loader_class, default_value="") - - -def _create_include_constructor(): - """Create a constructor for !include directives.""" - - def handle_include(loader: yaml.Loader, node: yaml.Node): - """Include file referenced at node.""" - filename = str(Path(loader.name).parent / node.value) - if filename.endswith((".yml", ".yaml")): - return _parse_yaml(filename) - - with Path(filename).open("rb") as f: - return f.read().decode(encoding="utf-8", errors="strict") - - return handle_include diff --git a/graphrag/index/operations/chunk_text/chunk_text.py b/graphrag/index/operations/chunk_text/chunk_text.py index 554cfbda35..02c12e6f1a 100644 --- a/graphrag/index/operations/chunk_text/chunk_text.py +++ b/graphrag/index/operations/chunk_text/chunk_text.py @@ -6,17 +6,14 @@ from typing import Any, cast import pandas as pd -from datashaper import ( - ProgressTicker, - VerbCallbacks, - progress_ticker, -) +from graphrag.callbacks.verb_callbacks import VerbCallbacks from graphrag.config.models.chunking_config import ChunkingConfig, ChunkStrategyType from graphrag.index.operations.chunk_text.typing import ( ChunkInput, ChunkStrategy, ) +from graphrag.logger.progress import ProgressTicker, progress_ticker def chunk_text( diff --git a/graphrag/index/operations/chunk_text/strategies.py b/graphrag/index/operations/chunk_text/strategies.py index 1468028537..3fc8fc6f2f 100644 --- a/graphrag/index/operations/chunk_text/strategies.py +++ b/graphrag/index/operations/chunk_text/strategies.py @@ -7,11 +7,11 @@ import nltk import tiktoken -from datashaper import ProgressTicker from graphrag.config.models.chunking_config import ChunkingConfig from graphrag.index.operations.chunk_text.typing import TextChunk from 
graphrag.index.text_splitting.text_splitting import Tokenizer +from graphrag.logger.progress import ProgressTicker def run_tokens( diff --git a/graphrag/index/operations/chunk_text/typing.py b/graphrag/index/operations/chunk_text/typing.py index 5f0994ec05..bf58ef5ec1 100644 --- a/graphrag/index/operations/chunk_text/typing.py +++ b/graphrag/index/operations/chunk_text/typing.py @@ -6,9 +6,8 @@ from collections.abc import Callable, Iterable from dataclasses import dataclass -from datashaper import ProgressTicker - from graphrag.config.models.chunking_config import ChunkingConfig +from graphrag.logger.progress import ProgressTicker @dataclass diff --git a/graphrag/index/operations/embed_text/embed_text.py b/graphrag/index/operations/embed_text/embed_text.py index f335802c5f..f4a7e5f367 100644 --- a/graphrag/index/operations/embed_text/embed_text.py +++ b/graphrag/index/operations/embed_text/embed_text.py @@ -9,9 +9,9 @@ import numpy as np import pandas as pd -from datashaper import VerbCallbacks from graphrag.cache.pipeline_cache import PipelineCache +from graphrag.callbacks.verb_callbacks import VerbCallbacks from graphrag.index.operations.embed_text.strategies.typing import TextEmbeddingStrategy from graphrag.utils.embeddings import create_collection_name from graphrag.vector_stores.base import BaseVectorStore, VectorStoreDocument diff --git a/graphrag/index/operations/embed_text/strategies/mock.py b/graphrag/index/operations/embed_text/strategies/mock.py index 3ebb1de8a2..9facd66643 100644 --- a/graphrag/index/operations/embed_text/strategies/mock.py +++ b/graphrag/index/operations/embed_text/strategies/mock.py @@ -7,10 +7,10 @@ from collections.abc import Iterable from typing import Any -from datashaper import ProgressTicker, VerbCallbacks, progress_ticker - from graphrag.cache.pipeline_cache import PipelineCache +from graphrag.callbacks.verb_callbacks import VerbCallbacks from graphrag.index.operations.embed_text.strategies.typing import TextEmbeddingResult 
+from graphrag.logger.progress import ProgressTicker, progress_ticker async def run( # noqa RUF029 async is required for interface diff --git a/graphrag/index/operations/embed_text/strategies/openai.py b/graphrag/index/operations/embed_text/strategies/openai.py index 36be774203..5bef604dab 100644 --- a/graphrag/index/operations/embed_text/strategies/openai.py +++ b/graphrag/index/operations/embed_text/strategies/openai.py @@ -8,17 +8,18 @@ from typing import Any import numpy as np -from datashaper import ProgressTicker, VerbCallbacks, progress_ticker from fnllm import EmbeddingsLLM from pydantic import TypeAdapter import graphrag.config.defaults as defs from graphrag.cache.pipeline_cache import PipelineCache +from graphrag.callbacks.verb_callbacks import VerbCallbacks from graphrag.config.models.llm_parameters import LLMParameters from graphrag.index.llm.load_llm import load_llm_embeddings from graphrag.index.operations.embed_text.strategies.typing import TextEmbeddingResult from graphrag.index.text_splitting.text_splitting import TokenTextSplitter from graphrag.index.utils.is_null import is_null +from graphrag.logger.progress import ProgressTicker, progress_ticker log = logging.getLogger(__name__) diff --git a/graphrag/index/operations/embed_text/strategies/typing.py b/graphrag/index/operations/embed_text/strategies/typing.py index b53d710c0b..5962045a67 100644 --- a/graphrag/index/operations/embed_text/strategies/typing.py +++ b/graphrag/index/operations/embed_text/strategies/typing.py @@ -6,9 +6,8 @@ from collections.abc import Awaitable, Callable from dataclasses import dataclass -from datashaper import VerbCallbacks - from graphrag.cache.pipeline_cache import PipelineCache +from graphrag.callbacks.verb_callbacks import VerbCallbacks @dataclass diff --git a/graphrag/index/operations/extract_covariates/claim_extractor.py b/graphrag/index/operations/extract_covariates/claim_extractor.py index e5fb6c3b40..a8758df0bf 100644 --- 
a/graphrag/index/operations/extract_covariates/claim_extractor.py +++ b/graphrag/index/operations/extract_covariates/claim_extractor.py @@ -88,8 +88,8 @@ def __init__( # Construct the looping arguments encoding = tiktoken.get_encoding(encoding_model or defs.ENCODING_MODEL) - yes = f"{encoding.encode('YES')[0]}" - no = f"{encoding.encode('NO')[0]}" + yes = f"{encoding.encode('Y')[0]}" + no = f"{encoding.encode('N')[0]}" self._loop_args = {"logit_bias": {yes: 100, no: 100}, "max_tokens": 1} async def __call__( @@ -195,7 +195,7 @@ async def _process_document( history=response.history, model_parameters=self._loop_args, ) - if response.output.content != "YES": + if response.output.content != "Y": break return self._parse_claim_tuples(results, prompt_args) diff --git a/graphrag/index/operations/extract_covariates/extract_covariates.py b/graphrag/index/operations/extract_covariates/extract_covariates.py index 5dab42b8df..323d95627d 100644 --- a/graphrag/index/operations/extract_covariates/extract_covariates.py +++ b/graphrag/index/operations/extract_covariates/extract_covariates.py @@ -9,20 +9,18 @@ from typing import Any import pandas as pd -from datashaper import ( - AsyncType, - VerbCallbacks, - derive_from_rows, -) import graphrag.config.defaults as defs from graphrag.cache.pipeline_cache import PipelineCache +from graphrag.callbacks.verb_callbacks import VerbCallbacks +from graphrag.config.enums import AsyncType from graphrag.index.llm.load_llm import load_llm, read_llm_params from graphrag.index.operations.extract_covariates.claim_extractor import ClaimExtractor from graphrag.index.operations.extract_covariates.typing import ( Covariate, CovariateExtractionResult, ) +from graphrag.index.run.derive_from_rows import derive_from_rows log = logging.getLogger(__name__) @@ -65,7 +63,7 @@ async def run_strategy(row): input, run_strategy, callbacks, - scheduling_type=async_mode, + async_type=async_mode, num_threads=num_threads, ) return pd.DataFrame([item for row in results 
for item in row or []]) diff --git a/graphrag/index/operations/extract_covariates/typing.py b/graphrag/index/operations/extract_covariates/typing.py index f5c7e0a02e..8f95b9b5fb 100644 --- a/graphrag/index/operations/extract_covariates/typing.py +++ b/graphrag/index/operations/extract_covariates/typing.py @@ -7,9 +7,8 @@ from dataclasses import dataclass from typing import Any -from datashaper import VerbCallbacks - from graphrag.cache.pipeline_cache import PipelineCache +from graphrag.callbacks.verb_callbacks import VerbCallbacks @dataclass diff --git a/graphrag/index/operations/extract_entities/extract_entities.py b/graphrag/index/operations/extract_entities/extract_entities.py index e3b7410d06..d50e1219b3 100644 --- a/graphrag/index/operations/extract_entities/extract_entities.py +++ b/graphrag/index/operations/extract_entities/extract_entities.py @@ -7,19 +7,17 @@ from typing import Any import pandas as pd -from datashaper import ( - AsyncType, - VerbCallbacks, - derive_from_rows, -) from graphrag.cache.pipeline_cache import PipelineCache +from graphrag.callbacks.verb_callbacks import VerbCallbacks +from graphrag.config.enums import AsyncType from graphrag.index.bootstrap import bootstrap from graphrag.index.operations.extract_entities.typing import ( Document, EntityExtractStrategy, ExtractEntityStrategyType, ) +from graphrag.index.run.derive_from_rows import derive_from_rows log = logging.getLogger(__name__) @@ -124,7 +122,7 @@ async def run_strategy(row): text_units, run_strategy, callbacks, - scheduling_type=async_mode, + async_type=async_mode, num_threads=num_threads, ) diff --git a/graphrag/index/operations/extract_entities/graph_extractor.py b/graphrag/index/operations/extract_entities/graph_extractor.py index 1a2ce19695..f10b2c83e5 100644 --- a/graphrag/index/operations/extract_entities/graph_extractor.py +++ b/graphrag/index/operations/extract_entities/graph_extractor.py @@ -92,8 +92,8 @@ def __init__( # Construct the looping arguments encoding = 
tiktoken.get_encoding(encoding_model or defs.ENCODING_MODEL) - yes = f"{encoding.encode('YES')[0]}" - no = f"{encoding.encode('NO')[0]}" + yes = f"{encoding.encode('Y')[0]}" + no = f"{encoding.encode('N')[0]}" self._loop_args = {"logit_bias": {yes: 100, no: 100}, "max_tokens": 1} async def __call__( @@ -180,7 +180,7 @@ async def _process_document( model_parameters=self._loop_args, ) - if response.output.content != "YES": + if response.output.content != "Y": break return results diff --git a/graphrag/index/operations/extract_entities/graph_intelligence_strategy.py b/graphrag/index/operations/extract_entities/graph_intelligence_strategy.py index 9084321621..2a403112a1 100644 --- a/graphrag/index/operations/extract_entities/graph_intelligence_strategy.py +++ b/graphrag/index/operations/extract_entities/graph_intelligence_strategy.py @@ -4,11 +4,11 @@ """A module containing run_graph_intelligence, run_extract_entities and _create_text_splitter methods to run graph intelligence.""" import networkx as nx -from datashaper import VerbCallbacks from fnllm import ChatLLM import graphrag.config.defaults as defs from graphrag.cache.pipeline_cache import PipelineCache +from graphrag.callbacks.verb_callbacks import VerbCallbacks from graphrag.index.llm.load_llm import load_llm, read_llm_params from graphrag.index.operations.extract_entities.graph_extractor import GraphExtractor from graphrag.index.operations.extract_entities.typing import ( diff --git a/graphrag/index/operations/extract_entities/nltk_strategy.py b/graphrag/index/operations/extract_entities/nltk_strategy.py index 81103c6955..e133aeeab4 100644 --- a/graphrag/index/operations/extract_entities/nltk_strategy.py +++ b/graphrag/index/operations/extract_entities/nltk_strategy.py @@ -5,10 +5,10 @@ import networkx as nx import nltk -from datashaper import VerbCallbacks from nltk.corpus import words from graphrag.cache.pipeline_cache import PipelineCache +from graphrag.callbacks.verb_callbacks import VerbCallbacks from 
graphrag.index.operations.extract_entities.typing import ( Document, EntityExtractionResult, diff --git a/graphrag/index/operations/extract_entities/typing.py b/graphrag/index/operations/extract_entities/typing.py index 7eb2440674..247c781003 100644 --- a/graphrag/index/operations/extract_entities/typing.py +++ b/graphrag/index/operations/extract_entities/typing.py @@ -9,9 +9,9 @@ from typing import Any import networkx as nx -from datashaper import VerbCallbacks from graphrag.cache.pipeline_cache import PipelineCache +from graphrag.callbacks.verb_callbacks import VerbCallbacks ExtractedEntity = dict[str, Any] ExtractedRelationship = dict[str, Any] diff --git a/graphrag/index/operations/layout_graph/layout_graph.py b/graphrag/index/operations/layout_graph/layout_graph.py index a4c7471292..b96ef91e34 100644 --- a/graphrag/index/operations/layout_graph/layout_graph.py +++ b/graphrag/index/operations/layout_graph/layout_graph.py @@ -5,8 +5,8 @@ import networkx as nx import pandas as pd -from datashaper import VerbCallbacks +from graphrag.callbacks.verb_callbacks import VerbCallbacks from graphrag.index.operations.embed_graph.typing import NodeEmbeddings from graphrag.index.operations.layout_graph.typing import GraphLayout diff --git a/graphrag/index/operations/snapshot.py b/graphrag/index/operations/snapshot.py deleted file mode 100644 index 1a61fce1cd..0000000000 --- a/graphrag/index/operations/snapshot.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
-# Licensed under the MIT License - -"""A module containing snapshot method definition.""" - -import pandas as pd - -from graphrag.storage.pipeline_storage import PipelineStorage - - -async def snapshot( - input: pd.DataFrame, - name: str, - formats: list[str], - storage: PipelineStorage, -) -> None: - """Take a entire snapshot of the tabular data.""" - for fmt in formats: - if fmt == "parquet": - await storage.set(f"{name}.parquet", input.to_parquet()) - elif fmt == "json": - await storage.set( - f"{name}.json", input.to_json(orient="records", lines=True) - ) diff --git a/graphrag/index/operations/snapshot_graphml.py b/graphrag/index/operations/snapshot_graphml.py index 6d1d488494..c1eb9b0688 100644 --- a/graphrag/index/operations/snapshot_graphml.py +++ b/graphrag/index/operations/snapshot_graphml.py @@ -1,7 +1,7 @@ # Copyright (c) 2024 Microsoft Corporation. # Licensed under the MIT License -"""A module containing snapshot method definition.""" +"""A module containing snapshot_graphml method definition.""" import networkx as nx diff --git a/graphrag/index/operations/summarize_communities/prepare_community_reports.py b/graphrag/index/operations/summarize_communities/prepare_community_reports.py index 45a6fec6d8..66fcaa2bb5 100644 --- a/graphrag/index/operations/summarize_communities/prepare_community_reports.py +++ b/graphrag/index/operations/summarize_communities/prepare_community_reports.py @@ -6,18 +6,16 @@ import logging import pandas as pd -from datashaper import ( - VerbCallbacks, - progress_iterable, -) import graphrag.index.operations.summarize_communities.community_reports_extractor.schemas as schemas +from graphrag.callbacks.verb_callbacks import VerbCallbacks from graphrag.index.operations.summarize_communities.community_reports_extractor.sort_context import ( parallel_sort_context_batch, ) from graphrag.index.operations.summarize_communities.community_reports_extractor.utils import ( get_levels, ) +from graphrag.logger.progress import 
progress_iterable log = logging.getLogger(__name__) diff --git a/graphrag/index/operations/summarize_communities/strategies.py b/graphrag/index/operations/summarize_communities/strategies.py index 9003e777bf..e630baba73 100644 --- a/graphrag/index/operations/summarize_communities/strategies.py +++ b/graphrag/index/operations/summarize_communities/strategies.py @@ -6,10 +6,10 @@ import logging import traceback -from datashaper import VerbCallbacks from fnllm import ChatLLM from graphrag.cache.pipeline_cache import PipelineCache +from graphrag.callbacks.verb_callbacks import VerbCallbacks from graphrag.index.llm.load_llm import load_llm, read_llm_params from graphrag.index.operations.summarize_communities.community_reports_extractor.community_reports_extractor import ( CommunityReportsExtractor, diff --git a/graphrag/index/operations/summarize_communities/summarize_communities.py b/graphrag/index/operations/summarize_communities/summarize_communities.py index d4c5c01072..df6dd631e1 100644 --- a/graphrag/index/operations/summarize_communities/summarize_communities.py +++ b/graphrag/index/operations/summarize_communities/summarize_communities.py @@ -6,17 +6,13 @@ import logging import pandas as pd -from datashaper import ( - AsyncType, - NoopVerbCallbacks, - VerbCallbacks, - derive_from_rows, - progress_ticker, -) import graphrag.config.defaults as defaults import graphrag.index.operations.summarize_communities.community_reports_extractor.schemas as schemas from graphrag.cache.pipeline_cache import PipelineCache +from graphrag.callbacks.noop_verb_callbacks import NoopVerbCallbacks +from graphrag.callbacks.verb_callbacks import VerbCallbacks +from graphrag.config.enums import AsyncType from graphrag.index.operations.summarize_communities.community_reports_extractor import ( prep_community_report_context, ) @@ -28,6 +24,8 @@ CommunityReportsStrategy, CreateCommunityReportsStrategyType, ) +from graphrag.index.run.derive_from_rows import derive_from_rows +from 
graphrag.logger.progress import progress_ticker log = logging.getLogger(__name__) @@ -77,7 +75,7 @@ async def run_generate(record): run_generate, callbacks=NoopVerbCallbacks(), num_threads=num_threads, - scheduling_type=async_mode, + async_type=async_mode, ) reports.extend([lr for lr in local_reports if lr is not None]) diff --git a/graphrag/index/operations/summarize_communities/typing.py b/graphrag/index/operations/summarize_communities/typing.py index 6c6b7e6773..2a1ed3aca5 100644 --- a/graphrag/index/operations/summarize_communities/typing.py +++ b/graphrag/index/operations/summarize_communities/typing.py @@ -7,10 +7,10 @@ from enum import Enum from typing import Any -from datashaper import VerbCallbacks from typing_extensions import TypedDict from graphrag.cache.pipeline_cache import PipelineCache +from graphrag.callbacks.verb_callbacks import VerbCallbacks ExtractedEntity = dict[str, Any] StrategyConfig = dict[str, Any] diff --git a/graphrag/index/operations/summarize_descriptions/graph_intelligence_strategy.py b/graphrag/index/operations/summarize_descriptions/graph_intelligence_strategy.py index 4a22b9b554..e5de39f57f 100644 --- a/graphrag/index/operations/summarize_descriptions/graph_intelligence_strategy.py +++ b/graphrag/index/operations/summarize_descriptions/graph_intelligence_strategy.py @@ -3,10 +3,10 @@ """A module containing run_graph_intelligence, run_resolve_entities and _create_text_list_splitter methods to run graph intelligence.""" -from datashaper import VerbCallbacks from fnllm import ChatLLM from graphrag.cache.pipeline_cache import PipelineCache +from graphrag.callbacks.verb_callbacks import VerbCallbacks from graphrag.index.llm.load_llm import load_llm, read_llm_params from graphrag.index.operations.summarize_descriptions.description_summary_extractor import ( SummarizeExtractor, diff --git a/graphrag/index/operations/summarize_descriptions/summarize_descriptions.py 
b/graphrag/index/operations/summarize_descriptions/summarize_descriptions.py index cf6650dd08..d1ad4af487 100644 --- a/graphrag/index/operations/summarize_descriptions/summarize_descriptions.py +++ b/graphrag/index/operations/summarize_descriptions/summarize_descriptions.py @@ -8,17 +8,14 @@ from typing import Any import pandas as pd -from datashaper import ( - ProgressTicker, - VerbCallbacks, - progress_ticker, -) from graphrag.cache.pipeline_cache import PipelineCache +from graphrag.callbacks.verb_callbacks import VerbCallbacks from graphrag.index.operations.summarize_descriptions.typing import ( SummarizationStrategy, SummarizeStrategyType, ) +from graphrag.logger.progress import ProgressTicker, progress_ticker log = logging.getLogger(__name__) diff --git a/graphrag/index/operations/summarize_descriptions/typing.py b/graphrag/index/operations/summarize_descriptions/typing.py index ca0ee13626..919ff9fd1c 100644 --- a/graphrag/index/operations/summarize_descriptions/typing.py +++ b/graphrag/index/operations/summarize_descriptions/typing.py @@ -8,9 +8,8 @@ from enum import Enum from typing import Any, NamedTuple -from datashaper import VerbCallbacks - from graphrag.cache.pipeline_cache import PipelineCache +from graphrag.callbacks.verb_callbacks import VerbCallbacks StrategyConfig = dict[str, Any] diff --git a/graphrag/index/run/__init__.py b/graphrag/index/run/__init__.py index afb43acd8e..d5e41d66a5 100644 --- a/graphrag/index/run/__init__.py +++ b/graphrag/index/run/__init__.py @@ -2,7 +2,3 @@ # Licensed under the MIT License """Run module for GraphRAG.""" - -from graphrag.index.run.run import run_pipeline, run_pipeline_with_config - -__all__ = ["run_pipeline", "run_pipeline_with_config"] diff --git a/graphrag/index/run/derive_from_rows.py b/graphrag/index/run/derive_from_rows.py new file mode 100644 index 0000000000..283621bb93 --- /dev/null +++ b/graphrag/index/run/derive_from_rows.py @@ -0,0 +1,158 @@ +# Copyright (c) 2024 Microsoft Corporation. 
+# Licensed under the MIT License + +"""Apply a generic transform function to each row in a table.""" + +import asyncio +import inspect +import logging +import traceback +from collections.abc import Awaitable, Callable, Coroutine, Hashable +from typing import Any, TypeVar, cast + +import pandas as pd + +from graphrag.callbacks.verb_callbacks import VerbCallbacks +from graphrag.config.enums import AsyncType +from graphrag.logger.progress import progress_ticker + +logger = logging.getLogger(__name__) +ItemType = TypeVar("ItemType") + + +class ParallelizationError(ValueError): + """Exception for invalid parallel processing.""" + + def __init__(self, num_errors: int): + super().__init__( + f"{num_errors} Errors occurred while running parallel transformation, could not complete!" + ) + + +async def derive_from_rows( + input: pd.DataFrame, + transform: Callable[[pd.Series], Awaitable[ItemType]], + callbacks: VerbCallbacks, + num_threads: int = 4, + async_type: AsyncType = AsyncType.AsyncIO, +) -> list[ItemType | None]: + """Apply a generic transform function to each row. Any errors will be reported and thrown.""" + match async_type: + case AsyncType.AsyncIO: + return await derive_from_rows_asyncio( + input, transform, callbacks, num_threads + ) + case AsyncType.Threaded: + return await derive_from_rows_asyncio_threads( + input, transform, callbacks, num_threads + ) + case _: + msg = f"Unsupported scheduling type {async_type}" + raise ValueError(msg) + + +"""A module containing the derive_from_rows_async method.""" + + +async def derive_from_rows_asyncio_threads( + input: pd.DataFrame, + transform: Callable[[pd.Series], Awaitable[ItemType]], + callbacks: VerbCallbacks, + num_threads: int | None = 4, +) -> list[ItemType | None]: + """ + Derive from rows asynchronously. + + This is useful for IO bound operations. 
+ """ + semaphore = asyncio.Semaphore(num_threads or 4) + + async def gather(execute: ExecuteFn[ItemType]) -> list[ItemType | None]: + tasks = [asyncio.to_thread(execute, row) for row in input.iterrows()] + + async def execute_task(task: Coroutine) -> ItemType | None: + async with semaphore: + # fire off the thread + thread = await task + return await thread + + return await asyncio.gather(*[execute_task(task) for task in tasks]) + + return await _derive_from_rows_base(input, transform, callbacks, gather) + + +"""A module containing the derive_from_rows_async method.""" + + +async def derive_from_rows_asyncio( + input: pd.DataFrame, + transform: Callable[[pd.Series], Awaitable[ItemType]], + callbacks: VerbCallbacks, + num_threads: int = 4, +) -> list[ItemType | None]: + """ + Derive from rows asynchronously. + + This is useful for IO bound operations. + """ + semaphore = asyncio.Semaphore(num_threads or 4) + + async def gather(execute: ExecuteFn[ItemType]) -> list[ItemType | None]: + async def execute_row_protected( + row: tuple[Hashable, pd.Series], + ) -> ItemType | None: + async with semaphore: + return await execute(row) + + tasks = [ + asyncio.create_task(execute_row_protected(row)) for row in input.iterrows() + ] + return await asyncio.gather(*tasks) + + return await _derive_from_rows_base(input, transform, callbacks, gather) + + +ItemType = TypeVar("ItemType") + +ExecuteFn = Callable[[tuple[Hashable, pd.Series]], Awaitable[ItemType | None]] +GatherFn = Callable[[ExecuteFn], Awaitable[list[ItemType | None]]] + + +async def _derive_from_rows_base( + input: pd.DataFrame, + transform: Callable[[pd.Series], Awaitable[ItemType]], + callbacks: VerbCallbacks, + gather: GatherFn[ItemType], +) -> list[ItemType | None]: + """ + Derive from rows asynchronously. + + This is useful for IO bound operations. 
+ """ + tick = progress_ticker(callbacks.progress, num_total=len(input)) + errors: list[tuple[BaseException, str]] = [] + + async def execute(row: tuple[Any, pd.Series]) -> ItemType | None: + try: + result = transform(row[1]) + if inspect.iscoroutine(result): + result = await result + except Exception as e: # noqa: BLE001 + errors.append((e, traceback.format_exc())) + return None + else: + return cast("ItemType", result) + finally: + tick(1) + + result = await gather(execute) + + tick.done() + + for error, stack in errors: + callbacks.error("parallel transformation error", error, stack) + + if len(errors) > 0: + raise ParallelizationError(len(errors)) + + return result diff --git a/graphrag/index/run/postprocess.py b/graphrag/index/run/postprocess.py deleted file mode 100644 index 52c20064ad..0000000000 --- a/graphrag/index/run/postprocess.py +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""Post Processing functions for the GraphRAG run module.""" - -from typing import cast - -import pandas as pd -from datashaper import DEFAULT_INPUT_NAME, WorkflowCallbacks - -from graphrag.index.config.input import PipelineInputConfigTypes -from graphrag.index.config.workflow import PipelineWorkflowStep -from graphrag.index.context import PipelineRunContext -from graphrag.index.workflows.load import create_workflow - - -def _create_postprocess_steps( - config: PipelineInputConfigTypes | None, -) -> list[PipelineWorkflowStep] | None: - """Retrieve the post process steps for the pipeline.""" - return config.post_process if config is not None else None - - -async def _run_post_process_steps( - post_process: list[PipelineWorkflowStep] | None, - dataset: pd.DataFrame, - context: PipelineRunContext, - callbacks: WorkflowCallbacks, -) -> pd.DataFrame: - """Run the pipeline. 
- - Args: - - post_process - The post process steps to run - - dataset - The dataset to run the steps on - - context - The pipeline run context - Returns: - - output - The dataset after running the post process steps - """ - if post_process: - input_workflow = create_workflow( - "Input Post Process", - post_process, - ) - input_workflow.add_table(DEFAULT_INPUT_NAME, dataset) - await input_workflow.run( - context=context, - callbacks=callbacks, - ) - dataset = cast("pd.DataFrame", input_workflow.output()) - return dataset diff --git a/graphrag/index/run/profiling.py b/graphrag/index/run/profiling.py deleted file mode 100644 index 36efcde019..0000000000 --- a/graphrag/index/run/profiling.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""Profiling functions for the GraphRAG run module.""" - -import json -import logging -import time -from dataclasses import asdict - -from datashaper import MemoryProfile, Workflow, WorkflowRunResult - -from graphrag.index.context import PipelineRunStats -from graphrag.storage.pipeline_storage import PipelineStorage - -log = logging.getLogger(__name__) - - -async def _save_profiler_stats( - storage: PipelineStorage, workflow_name: str, profile: MemoryProfile -): - """Save the profiler stats to the storage.""" - await storage.set( - f"{workflow_name}_profiling.peak_stats.csv", - profile.peak_stats.to_csv(index=True), - ) - - await storage.set( - f"{workflow_name}_profiling.snapshot_stats.csv", - profile.snapshot_stats.to_csv(index=True), - ) - - await storage.set( - f"{workflow_name}_profiling.time_stats.csv", - profile.time_stats.to_csv(index=True), - ) - - await storage.set( - f"{workflow_name}_profiling.detailed_view.csv", - profile.detailed_view.to_csv(index=True), - ) - - -async def _dump_stats(stats: PipelineRunStats, storage: PipelineStorage) -> None: - """Dump the stats to the storage.""" - await storage.set( - "stats.json", json.dumps(asdict(stats), indent=4, 
ensure_ascii=False) - ) - - -async def _write_workflow_stats( - workflow: Workflow, - workflow_result: WorkflowRunResult, - workflow_start_time: float, - start_time: float, - stats: PipelineRunStats, - storage: PipelineStorage, -) -> None: - """Write the workflow stats to the storage.""" - for vt in workflow_result.verb_timings: - stats.workflows[workflow.name][f"{vt.index}_{vt.verb}"] = vt.timing - - workflow_end_time = time.time() - stats.workflows[workflow.name]["overall"] = workflow_end_time - workflow_start_time - stats.total_runtime = time.time() - start_time - await _dump_stats(stats, storage) - - if workflow_result.memory_profile is not None: - await _save_profiler_stats( - storage, workflow.name, workflow_result.memory_profile - ) diff --git a/graphrag/index/run/run.py b/graphrag/index/run/run.py deleted file mode 100644 index c050c6664d..0000000000 --- a/graphrag/index/run/run.py +++ /dev/null @@ -1,282 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""Different methods to run the pipeline.""" - -import gc -import logging -import time -import traceback -from collections.abc import AsyncIterable -from typing import cast - -import pandas as pd -from datashaper import NoopVerbCallbacks, WorkflowCallbacks - -from graphrag.cache.factory import CacheFactory -from graphrag.cache.pipeline_cache import PipelineCache -from graphrag.callbacks.console_workflow_callbacks import ConsoleWorkflowCallbacks -from graphrag.index.config.pipeline import ( - PipelineConfig, - PipelineWorkflowReference, -) -from graphrag.index.config.workflow import PipelineWorkflowStep -from graphrag.index.exporter import ParquetExporter -from graphrag.index.input.factory import create_input -from graphrag.index.load_pipeline_config import load_pipeline_config -from graphrag.index.run.postprocess import ( - _create_postprocess_steps, - _run_post_process_steps, -) -from graphrag.index.run.profiling import _dump_stats -from graphrag.index.run.utils 
import ( - _apply_substitutions, - _validate_dataset, - create_run_context, -) -from graphrag.index.run.workflow import ( - _process_workflow, - create_callback_chain, -) -from graphrag.index.typing import PipelineRunResult -from graphrag.index.update.incremental_index import ( - get_delta_docs, - update_dataframe_outputs, -) -from graphrag.index.workflows import ( - VerbDefinitions, - WorkflowDefinitions, - load_workflows, -) -from graphrag.logger.base import ProgressLogger -from graphrag.logger.null_progress import NullProgressLogger -from graphrag.storage.factory import StorageFactory -from graphrag.storage.pipeline_storage import PipelineStorage - -log = logging.getLogger(__name__) - - -async def run_pipeline_with_config( - config_or_path: PipelineConfig | str, - workflows: list[PipelineWorkflowReference] | None = None, - dataset: pd.DataFrame | None = None, - storage: PipelineStorage | None = None, - update_index_storage: PipelineStorage | None = None, - cache: PipelineCache | None = None, - callbacks: list[WorkflowCallbacks] | None = None, - logger: ProgressLogger | None = None, - input_post_process_steps: list[PipelineWorkflowStep] | None = None, - additional_verbs: VerbDefinitions | None = None, - additional_workflows: WorkflowDefinitions | None = None, - memory_profile: bool = False, - run_id: str | None = None, - is_resume_run: bool = False, - is_update_run: bool = False, - **_kwargs: dict, -) -> AsyncIterable[PipelineRunResult]: - """Run a pipeline with the given config. 
- - Args: - - config_or_path - The config to run the pipeline with - - workflows - The workflows to run (this overrides the config) - - dataset - The dataset to run the pipeline on (this overrides the config) - - storage - The storage to use for the pipeline (this overrides the config) - - cache - The cache to use for the pipeline (this overrides the config) - - logger - The logger to use for the pipeline (this overrides the config) - - input_post_process_steps - The post process steps to run on the input data (this overrides the config) - - additional_verbs - The custom verbs to use for the pipeline. - - additional_workflows - The custom workflows to use for the pipeline. - - memory_profile - Whether or not to profile the memory. - - run_id - The run id to start or resume from. - """ - if isinstance(config_or_path, str): - log.info("Running pipeline with config %s", config_or_path) - else: - log.info("Running pipeline") - - run_id = run_id or time.strftime("%Y%m%d-%H%M%S") - config = load_pipeline_config(config_or_path) - config = _apply_substitutions(config, run_id) - root_dir = config.root_dir or "" - - progress_logger = logger or NullProgressLogger() - storage_config = config.storage.model_dump() # type: ignore - storage = storage or StorageFactory().create_storage( - storage_type=storage_config["type"], # type: ignore - kwargs=storage_config, - ) - - if is_update_run: - update_storage_config = config.update_index_storage.model_dump() # type: ignore - update_index_storage = update_index_storage or StorageFactory().create_storage( - storage_type=update_storage_config["type"], # type: ignore - kwargs=update_storage_config, - ) - - # TODO: remove the type ignore when the new config system guarantees the existence of a cache config - cache_config = config.cache.model_dump() # type: ignore - cache = cache or CacheFactory().create_cache( - cache_type=cache_config["type"], # type: ignore - root_dir=root_dir, - kwargs=cache_config, - ) - # TODO: remove the type ignore 
when the new config system guarantees the existence of an input config - dataset = ( - dataset - if dataset is not None - else await create_input(config.input, progress_logger, root_dir) # type: ignore - ) - - post_process_steps = input_post_process_steps or _create_postprocess_steps( - config.input - ) - workflows = workflows or config.workflows - - if is_update_run and update_index_storage: - delta_dataset = await get_delta_docs(dataset, storage) - - # Fail on empty delta dataset - if delta_dataset.new_inputs.empty: - error_msg = "Incremental Indexing Error: No new documents to process." - raise ValueError(error_msg) - - delta_storage = update_index_storage.child("delta") - - # Run the pipeline on the new documents - tables_dict = {} - async for table in run_pipeline( - workflows=workflows, - dataset=delta_dataset.new_inputs, - storage=delta_storage, - cache=cache, - callbacks=callbacks, - input_post_process_steps=post_process_steps, - memory_profile=memory_profile, - additional_verbs=additional_verbs, - additional_workflows=additional_workflows, - progress_logger=progress_logger, - is_resume_run=False, - ): - tables_dict[table.workflow] = table.result - - progress_logger.success("Finished running workflows on new documents.") - await update_dataframe_outputs( - dataframe_dict=tables_dict, - storage=storage, - update_storage=update_index_storage, - config=config, - cache=cache, - callbacks=NoopVerbCallbacks(), - progress_logger=progress_logger, - ) - - else: - async for table in run_pipeline( - workflows=workflows, - dataset=dataset, - storage=storage, - cache=cache, - callbacks=callbacks, - input_post_process_steps=post_process_steps, - memory_profile=memory_profile, - additional_verbs=additional_verbs, - additional_workflows=additional_workflows, - progress_logger=progress_logger, - is_resume_run=is_resume_run, - ): - yield table - - -async def run_pipeline( - workflows: list[PipelineWorkflowReference], - dataset: pd.DataFrame, - storage: PipelineStorage | None 
= None, - cache: PipelineCache | None = None, - callbacks: list[WorkflowCallbacks] | None = None, - progress_logger: ProgressLogger | None = None, - input_post_process_steps: list[PipelineWorkflowStep] | None = None, - additional_verbs: VerbDefinitions | None = None, - additional_workflows: WorkflowDefinitions | None = None, - memory_profile: bool = False, - is_resume_run: bool = False, - **_kwargs: dict, -) -> AsyncIterable[PipelineRunResult]: - """Run the pipeline. - - Args: - - workflows - The workflows to run - - dataset - The dataset to run the pipeline on, specifically a dataframe with the following columns at a minimum: - - id - The id of the document - - text - The text of the document - - title - The title of the document - These must exist after any post process steps are run if there are any! - - storage - The storage to use for the pipeline - - cache - The cache to use for the pipeline - - progress_logger - The logger to use for the pipeline - - input_post_process_steps - The post process steps to run on the input data - - additional_verbs - The custom verbs to use for the pipeline - - additional_workflows - The custom workflows to use for the pipeline - - debug - Whether or not to run in debug mode - Returns: - - output - An iterable of workflow results as they complete running, as well as any errors that occur - """ - start_time = time.time() - - progress_reporter = progress_logger or NullProgressLogger() - callbacks = callbacks or [ConsoleWorkflowCallbacks()] - callback_chain = create_callback_chain(callbacks, progress_reporter) - context = create_run_context(storage=storage, cache=cache, stats=None) - exporter = ParquetExporter( - context.storage, - lambda e, s, d: cast("WorkflowCallbacks", callback_chain).on_error( - "Error exporting table", e, s, d - ), - ) - - loaded_workflows = load_workflows( - workflows, - additional_verbs=additional_verbs, - additional_workflows=additional_workflows, - memory_profile=memory_profile, - ) - workflows_to_run = 
loaded_workflows.workflows - workflow_dependencies = loaded_workflows.dependencies - dataset = await _run_post_process_steps( - input_post_process_steps, dataset, context, callback_chain - ) - - # ensure the incoming data is valid - _validate_dataset(dataset) - - log.info("Final # of rows loaded: %s", len(dataset)) - context.stats.num_documents = len(dataset) - last_workflow = "input" - - try: - await _dump_stats(context.stats, context.storage) - - for workflow_to_run in workflows_to_run: - # flush out any intermediate dataframes - gc.collect() - last_workflow = workflow_to_run.workflow.name - result = await _process_workflow( - workflow_to_run.workflow, - context, - callback_chain, - exporter, - workflow_dependencies, - dataset, - start_time, - is_resume_run, - ) - if result: - yield result - - context.stats.total_runtime = time.time() - start_time - await _dump_stats(context.stats, context.storage) - except Exception as e: - log.exception("error running workflow %s", last_workflow) - cast("WorkflowCallbacks", callback_chain).on_error( - "Error running pipeline!", e, traceback.format_exc() - ) - yield PipelineRunResult(last_workflow, None, [e]) diff --git a/graphrag/index/run/run_workflows.py b/graphrag/index/run/run_workflows.py index 7c27dda10b..c264462799 100644 --- a/graphrag/index/run/run_workflows.py +++ b/graphrag/index/run/run_workflows.py @@ -3,33 +3,38 @@ """Different methods to run the pipeline.""" +import json import logging import time import traceback from collections.abc import AsyncIterable -from typing import Any, cast +from dataclasses import asdict +from typing import cast -from datashaper import ( - DelegatingVerbCallbacks, - ExecutionNode, - VerbDetails, - WorkflowCallbacks, -) -from datashaper.progress.types import Progress +import pandas as pd from graphrag.cache.factory import CacheFactory from graphrag.cache.pipeline_cache import PipelineCache from graphrag.callbacks.console_workflow_callbacks import ConsoleWorkflowCallbacks +from 
graphrag.callbacks.delegating_verb_callbacks import DelegatingVerbCallbacks +from graphrag.callbacks.noop_verb_callbacks import NoopVerbCallbacks +from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks from graphrag.config.models.graph_rag_config import GraphRagConfig +from graphrag.index.context import PipelineRunStats from graphrag.index.input.factory import create_input -from graphrag.index.run.profiling import _dump_stats -from graphrag.index.run.utils import create_run_context -from graphrag.index.run.workflow import create_callback_chain +from graphrag.index.run.utils import create_callback_chain, create_run_context from graphrag.index.typing import PipelineRunResult -from graphrag.index.workflows.default_workflows import basic_workflows +from graphrag.index.update.incremental_index import ( + get_delta_docs, + update_dataframe_outputs, +) +from graphrag.index.workflows import all_workflows from graphrag.logger.base import ProgressLogger from graphrag.logger.null_progress import NullProgressLogger +from graphrag.logger.progress import Progress from graphrag.storage.factory import StorageFactory +from graphrag.storage.pipeline_storage import PipelineStorage +from graphrag.utils.storage import delete_table_from_storage, write_table_to_storage log = logging.getLogger(__name__) @@ -38,17 +43,28 @@ "create_base_text_units", "create_final_documents", "extract_graph", - "create_final_covariates", "compute_communities", "create_final_entities", "create_final_relationships", "create_final_nodes", "create_final_communities", + "create_final_covariates", "create_final_text_units", "create_final_community_reports", "generate_text_embeddings", ] +# these are transient outputs written to storage for downstream workflow use +# they are not required after indexing, so we'll clean them up at the end for clarity +# (unless snapshots.transient is set!) 
+transient_outputs = [ + "input", + "base_communities", + "base_entity_nodes", + "base_relationship_edges", + "create_base_text_units", +] + async def run_workflows( config: GraphRagConfig, @@ -56,10 +72,9 @@ async def run_workflows( callbacks: list[WorkflowCallbacks] | None = None, logger: ProgressLogger | None = None, run_id: str | None = None, + is_update_run: bool = False, ) -> AsyncIterable[PipelineRunResult]: """Run all workflows using a simplified pipeline.""" - start_time = time.time() - run_id = run_id or time.strftime("%Y%m%d-%H%M%S") root_dir = config.root_dir or "" progress_logger = logger or NullProgressLogger() @@ -77,9 +92,75 @@ async def run_workflows( kwargs=cache_config, ) - context = create_run_context(storage=storage, cache=cache, stats=None) + dataset = await create_input(config.input, logger, root_dir) + + if is_update_run: + progress_logger.info("Running incremental indexing.") + + update_storage_config = config.update_index_storage.model_dump() # type: ignore + update_index_storage = StorageFactory().create_storage( + storage_type=update_storage_config["type"], # type: ignore + kwargs=update_storage_config, + ) + + delta_dataset = await get_delta_docs(dataset, storage) + + # Fail on empty delta dataset + if delta_dataset.new_inputs.empty: + error_msg = "Incremental Indexing Error: No new documents to process." 
+ raise ValueError(error_msg) + + delta_storage = update_index_storage.child("delta") + + # Run the pipeline on the new documents + tables_dict = {} + async for table in _run_workflows( + config=config, + dataset=delta_dataset.new_inputs, + cache=cache, + storage=delta_storage, + callbacks=callback_chain, + logger=progress_logger, + ): + tables_dict[table.workflow] = table.result + + progress_logger.success("Finished running workflows on new documents.") + + await update_dataframe_outputs( + dataframe_dict=tables_dict, + storage=storage, + update_storage=update_index_storage, + config=config, + cache=cache, + callbacks=NoopVerbCallbacks(), + progress_logger=progress_logger, + ) - dataset = await create_input(config.input, progress_logger, root_dir) + else: + progress_logger.info("Running standard indexing.") + + async for table in _run_workflows( + config=config, + dataset=dataset, + cache=cache, + storage=storage, + callbacks=callback_chain, + logger=progress_logger, + ): + yield table + + +async def _run_workflows( + config: GraphRagConfig, + dataset: pd.DataFrame, + cache: PipelineCache, + storage: PipelineStorage, + callbacks: WorkflowCallbacks, + logger: ProgressLogger, +) -> AsyncIterable[PipelineRunResult]: + start_time = time.time() + + context = create_run_context(storage=storage, cache=cache, stats=None) log.info("Final # of rows loaded: %s", len(dataset)) context.stats.num_documents = len(dataset) @@ -87,28 +168,14 @@ async def run_workflows( try: await _dump_stats(context.stats, context.storage) - await context.runtime_storage.set("input", dataset) + await write_table_to_storage(dataset, "input", context.storage) for workflow in default_workflows: last_workflow = workflow - run_workflow = basic_workflows[workflow] - progress = progress_logger.child(workflow, transient=False) - callback_chain.on_workflow_start(workflow, None) - # TEMP: this structure is required for DataShaper downstream compliance - node = cast( - "Any", - ExecutionNode( - 
node_id=workflow, - has_explicit_id=True, - verb=VerbDetails( - name=workflow, - func=lambda x: x, - treats_input_tables_as_immutable=False, - ), - node_input="", - ), - ) - verb_callbacks = DelegatingVerbCallbacks(node, callback_chain) + run_workflow = all_workflows[workflow] + progress = logger.child(workflow, transient=False) + callbacks.on_workflow_start(workflow, None) + verb_callbacks = DelegatingVerbCallbacks(workflow, callbacks) work_time = time.time() result = await run_workflow( config, @@ -116,16 +183,28 @@ async def run_workflows( verb_callbacks, ) progress(Progress(percent=1)) - callback_chain.on_workflow_end(workflow, None) + callbacks.on_workflow_end(workflow, result) yield PipelineRunResult(workflow, result, None) context.stats.workflows[workflow] = {"overall": time.time() - work_time} context.stats.total_runtime = time.time() - start_time await _dump_stats(context.stats, context.storage) + + if not config.snapshots.transient: + for output in transient_outputs: + await delete_table_from_storage(output, context.storage) + except Exception as e: log.exception("error running workflow %s", last_workflow) - cast("WorkflowCallbacks", callback_chain).on_error( + cast("WorkflowCallbacks", callbacks).on_error( "Error running pipeline!", e, traceback.format_exc() ) yield PipelineRunResult(last_workflow, None, [e]) + + +async def _dump_stats(stats: PipelineRunStats, storage: PipelineStorage) -> None: + """Dump the stats to the storage.""" + await storage.set( + "stats.json", json.dumps(asdict(stats), indent=4, ensure_ascii=False) + ) diff --git a/graphrag/index/run/utils.py b/graphrag/index/run/utils.py index e78ee11179..04dd31df24 100644 --- a/graphrag/index/run/utils.py +++ b/graphrag/index/run/utils.py @@ -3,89 +3,16 @@ """Utility functions for the GraphRAG run module.""" -import logging -from string import Template -from typing import Any - -import pandas as pd - from graphrag.cache.memory_pipeline_cache import InMemoryCache from 
graphrag.cache.pipeline_cache import PipelineCache -from graphrag.index.config.cache import ( - PipelineBlobCacheConfig, - PipelineFileCacheConfig, -) -from graphrag.index.config.pipeline import PipelineConfig -from graphrag.index.config.reporting import ( - PipelineBlobReportingConfig, - PipelineFileReportingConfig, -) -from graphrag.index.config.storage import ( - PipelineBlobStorageConfig, - PipelineFileStorageConfig, -) +from graphrag.callbacks.progress_workflow_callbacks import ProgressWorkflowCallbacks +from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks +from graphrag.callbacks.workflow_callbacks_manager import WorkflowCallbacksManager from graphrag.index.context import PipelineRunContext, PipelineRunStats +from graphrag.logger.base import ProgressLogger from graphrag.storage.memory_pipeline_storage import MemoryPipelineStorage from graphrag.storage.pipeline_storage import PipelineStorage -log = logging.getLogger(__name__) - - -def _validate_dataset(dataset: Any): - """Validate the dataset for the pipeline. - - Args: - - dataset - The dataset to validate - """ - if not isinstance(dataset, pd.DataFrame): - msg = "Dataset must be a pandas dataframe!" 
- raise TypeError(msg) - - -def _apply_substitutions(config: PipelineConfig, run_id: str) -> PipelineConfig: - """Apply the substitutions to the configuration.""" - substitutions = {"timestamp": run_id} - - if ( - isinstance( - config.storage, PipelineFileStorageConfig | PipelineBlobStorageConfig - ) - and config.storage.base_dir - ): - config.storage.base_dir = Template(config.storage.base_dir).substitute( - substitutions - ) - if ( - config.update_index_storage - and isinstance( - config.update_index_storage, - PipelineFileStorageConfig | PipelineBlobStorageConfig, - ) - and config.update_index_storage.base_dir - ): - config.update_index_storage.base_dir = Template( - config.update_index_storage.base_dir - ).substitute(substitutions) - if ( - isinstance(config.cache, PipelineFileCacheConfig | PipelineBlobCacheConfig) - and config.cache.base_dir - ): - config.cache.base_dir = Template(config.cache.base_dir).substitute( - substitutions - ) - - if ( - isinstance( - config.reporting, PipelineFileReportingConfig | PipelineBlobReportingConfig - ) - and config.reporting.base_dir - ): - config.reporting.base_dir = Template(config.reporting.base_dir).substitute( - substitutions - ) - - return config - def create_run_context( storage: PipelineStorage | None, @@ -97,5 +24,16 @@ def create_run_context( stats=stats or PipelineRunStats(), cache=cache or InMemoryCache(), storage=storage or MemoryPipelineStorage(), - runtime_storage=MemoryPipelineStorage(), ) + + +def create_callback_chain( + callbacks: list[WorkflowCallbacks] | None, progress: ProgressLogger | None +) -> WorkflowCallbacks: + """Create a callback manager that encompasses multiple callbacks.""" + manager = WorkflowCallbacksManager() + for callback in callbacks or []: + manager.register(callback) + if progress is not None: + manager.register(ProgressWorkflowCallbacks(progress)) + return manager diff --git a/graphrag/index/run/workflow.py b/graphrag/index/run/workflow.py deleted file mode 100644 index 
d4ec70f077..0000000000 --- a/graphrag/index/run/workflow.py +++ /dev/null @@ -1,154 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""Workflow functions for the GraphRAG update module.""" - -import logging -import time -from typing import cast - -import pandas as pd -from datashaper import ( - DEFAULT_INPUT_NAME, - Workflow, - WorkflowCallbacks, - WorkflowCallbacksManager, -) - -from graphrag.callbacks.progress_workflow_callbacks import ProgressWorkflowCallbacks -from graphrag.index.config.pipeline import PipelineConfig -from graphrag.index.context import PipelineRunContext -from graphrag.index.exporter import ParquetExporter -from graphrag.index.run.profiling import _write_workflow_stats -from graphrag.index.typing import PipelineRunResult -from graphrag.logger.base import ProgressLogger -from graphrag.storage.pipeline_storage import PipelineStorage -from graphrag.utils.storage import load_table_from_storage - -log = logging.getLogger(__name__) - - -async def _inject_workflow_data_dependencies( - workflow: Workflow, - workflow_dependencies: dict[str, list[str]], - dataset: pd.DataFrame, - storage: PipelineStorage, -) -> None: - """Inject the data dependencies into the workflow.""" - workflow.add_table(DEFAULT_INPUT_NAME, dataset) - deps = workflow_dependencies[workflow.name] - log.info("dependencies for %s: %s", workflow.name, deps) - for id in deps: - workflow_id = f"workflow:{id}" - try: - table = await load_table_from_storage(f"{id}.parquet", storage) - except ValueError: - # our workflows allow for transient tables, and we avoid putting those in storage - # however, we need to keep the table in the dependency list for proper execution order. 
- # this allows us to catch missing table errors and issue a warning for pipeline users who may genuinely have an error (which we expect to be very rare) - # todo: this issue will resolve itself once we remove DataShaper completely - log.warning( - "Dependency table %s not found in storage: it may be a runtime-only in-memory table. If you see further errors, this may be an actual problem.", - id, - ) - table = pd.DataFrame() - workflow.add_table(workflow_id, table) - - -async def _export_workflow_output( - workflow: Workflow, exporter: ParquetExporter -) -> pd.DataFrame: - """Export the output from each step of the workflow.""" - output = cast("pd.DataFrame", workflow.output()) - # only write final output that is not empty (i.e. has content) - # NOTE: this design is intentional - it accounts for workflow steps with "side effects" that don't produce a formal output to save - if not output.empty: - await exporter.export(workflow.name, output) - return output - - -def create_callback_chain( - callbacks: list[WorkflowCallbacks] | None, progress: ProgressLogger | None -) -> WorkflowCallbacks: - """Create a callback manager that encompasses multiple callbacks.""" - manager = WorkflowCallbacksManager() - for callback in callbacks or []: - manager.register(callback) - if progress is not None: - manager.register(ProgressWorkflowCallbacks(progress)) - return manager - - -async def _process_workflow( - workflow: Workflow, - context: PipelineRunContext, - callbacks: WorkflowCallbacks, - exporter: ParquetExporter, - workflow_dependencies: dict[str, list[str]], - dataset: pd.DataFrame, - start_time: float, - is_resume_run: bool, -): - workflow_name = workflow.name - if is_resume_run and await context.storage.has(f"{workflow_name}.parquet"): - log.info("Skipping %s because it already exists", workflow_name) - return None - - context.stats.workflows[workflow_name] = {"overall": 0.0} - - await _inject_workflow_data_dependencies( - workflow, - workflow_dependencies, - dataset, - 
context.storage, - ) - - workflow_start_time = time.time() - result = await workflow.run(context, callbacks) - await _write_workflow_stats( - workflow, - result, - workflow_start_time, - start_time, - context.stats, - context.storage, - ) - - # Save the output from the workflow - output = await _export_workflow_output(workflow, exporter) - workflow.dispose() - return PipelineRunResult(workflow_name, output, None) - - -def _find_workflow_config( - config: PipelineConfig, workflow_name: str, step: str | None = None -) -> dict: - """Find a workflow in the pipeline configuration. - - Parameters - ---------- - config : PipelineConfig - The pipeline configuration. - workflow_name : str - The name of the workflow. - step : str - The step in the workflow. - - Returns - ------- - dict - The workflow configuration. - """ - try: - workflow = next( - filter(lambda workflow: workflow.name == workflow_name, config.workflows) - ) - except StopIteration as err: - error_message = ( - f"Workflow {workflow_name} not found in the pipeline configuration." 
- ) - raise ValueError(error_message) from err - - if not workflow.config: - return {} - return workflow.config if not step else workflow.config.get(step, {}) diff --git a/graphrag/index/update/entities.py b/graphrag/index/update/entities.py index 3c117ac837..849fa4a749 100644 --- a/graphrag/index/update/entities.py +++ b/graphrag/index/update/entities.py @@ -8,14 +8,13 @@ import numpy as np import pandas as pd -from datashaper import VerbCallbacks from graphrag.cache.pipeline_cache import PipelineCache -from graphrag.index.config.pipeline import PipelineConfig +from graphrag.callbacks.verb_callbacks import VerbCallbacks +from graphrag.config.models.graph_rag_config import GraphRagConfig from graphrag.index.operations.summarize_descriptions.graph_intelligence_strategy import ( run_graph_intelligence as run_entity_summarization, ) -from graphrag.index.run.workflow import _find_workflow_config def _group_and_resolve_entities( @@ -91,7 +90,7 @@ def _group_and_resolve_entities( async def _run_entity_summarization( entities_df: pd.DataFrame, - config: PipelineConfig, + config: GraphRagConfig, cache: PipelineCache, callbacks: VerbCallbacks, ) -> pd.DataFrame: @@ -101,7 +100,7 @@ async def _run_entity_summarization( ---------- entities_df : pd.DataFrame The entities dataframe. - config : PipelineConfig + config : GraphRagConfig The pipeline configuration. cache : PipelineCache Pipeline cache used during the summarization process. @@ -111,10 +110,9 @@ async def _run_entity_summarization( pd.DataFrame The updated entities dataframe with summarized descriptions. 
""" - summarize_config = _find_workflow_config( - config, "extract_graph", "summarize_descriptions" + summarization_strategy = config.summarize_descriptions.resolved_strategy( + config.root_dir, ) - strategy = summarize_config.get("strategy", {}) # Prepare tasks for async summarization where needed async def process_row(row): @@ -122,7 +120,7 @@ async def process_row(row): if isinstance(description, list) and len(description) > 1: # Run entity summarization asynchronously result = await run_entity_summarization( - row["title"], description, callbacks, cache, strategy + row["title"], description, callbacks, cache, summarization_strategy ) return result.description # Handle case where description is a single-item list or not a list diff --git a/graphrag/index/update/incremental_index.py b/graphrag/index/update/incremental_index.py index 05da47f01d..4ba486af6b 100644 --- a/graphrag/index/update/incremental_index.py +++ b/graphrag/index/update/incremental_index.py @@ -7,12 +7,12 @@ import numpy as np import pandas as pd -from datashaper import VerbCallbacks from graphrag.cache.pipeline_cache import PipelineCache -from graphrag.index.config.pipeline import PipelineConfig +from graphrag.callbacks.verb_callbacks import VerbCallbacks +from graphrag.config.models.graph_rag_config import GraphRagConfig +from graphrag.index.config.embeddings import get_embedded_fields, get_embedding_settings from graphrag.index.flows.generate_text_embeddings import generate_text_embeddings -from graphrag.index.run.workflow import _find_workflow_config from graphrag.index.update.communities import ( _merge_and_resolve_nodes, _update_and_merge_communities, @@ -25,7 +25,11 @@ from graphrag.index.update.relationships import _update_and_merge_relationships from graphrag.logger.print_progress import ProgressLogger from graphrag.storage.pipeline_storage import PipelineStorage -from graphrag.utils.storage import load_table_from_storage +from graphrag.utils.storage import ( + load_table_from_storage, 
+ storage_has_table, + write_table_to_storage, +) @dataclass @@ -61,9 +65,7 @@ async def get_delta_docs( InputDelta The input delta. With new inputs and deleted inputs. """ - final_docs = await load_table_from_storage( - "create_final_documents.parquet", storage - ) + final_docs = await load_table_from_storage("create_final_documents", storage) # Select distinct title from final docs and from dataset previous_docs: list[str] = final_docs["title"].unique().tolist() @@ -82,7 +84,7 @@ async def update_dataframe_outputs( dataframe_dict: dict[str, pd.DataFrame], storage: PipelineStorage, update_storage: PipelineStorage, - config: PipelineConfig, + config: GraphRagConfig, cache: PipelineCache, callbacks: VerbCallbacks, progress_logger: ProgressLogger, @@ -121,7 +123,7 @@ async def update_dataframe_outputs( # Merge final covariates if ( - await storage.has("create_final_covariates.parquet") + await storage_has_table("create_final_covariates", storage) and "create_final_covariates" in dataframe_dict ): progress_logger.info("Updating Final Covariates") @@ -145,13 +147,10 @@ async def update_dataframe_outputs( dataframe_dict, storage, update_storage, community_id_mapping ) - # Extract the embeddings config - embeddings_config = _find_workflow_config( - config=config, workflow_name="generate_text_embeddings" - ) - # Generate text embeddings progress_logger.info("Updating Text Embeddings") + embedded_fields = get_embedded_fields(config) + text_embed = get_embedding_settings(config.embeddings) await generate_text_embeddings( final_documents=final_documents_df, final_relationships=merged_relationships_df, @@ -161,9 +160,9 @@ async def update_dataframe_outputs( callbacks=callbacks, cache=cache, storage=update_storage, - text_embed_config=embeddings_config.get("text_embed", {}), - embedded_fields=embeddings_config.get("embedded_fields", {}), - snapshot_embeddings_enabled=embeddings_config.get("snapshot_embeddings", False), + text_embed_config=text_embed, + 
embedded_fields=embedded_fields, + snapshot_embeddings_enabled=config.snapshots.embeddings, ) @@ -172,7 +171,7 @@ async def _update_community_reports( ): """Update the community reports output.""" old_community_reports = await load_table_from_storage( - "create_final_community_reports.parquet", storage + "create_final_community_reports", storage ) delta_community_reports = dataframe_dict["create_final_community_reports"] @@ -180,9 +179,8 @@ async def _update_community_reports( old_community_reports, delta_community_reports, community_id_mapping ) - await update_storage.set( - "create_final_community_reports.parquet", - merged_community_reports.to_parquet(), + await write_table_to_storage( + merged_community_reports, "create_final_community_reports", update_storage ) return merged_community_reports @@ -192,42 +190,40 @@ async def _update_communities( dataframe_dict, storage, update_storage, community_id_mapping ): """Update the communities output.""" - old_communities = await load_table_from_storage( - "create_final_communities.parquet", storage - ) + old_communities = await load_table_from_storage("create_final_communities", storage) delta_communities = dataframe_dict["create_final_communities"] merged_communities = _update_and_merge_communities( old_communities, delta_communities, community_id_mapping ) - await update_storage.set( - "create_final_communities.parquet", merged_communities.to_parquet() + await write_table_to_storage( + merged_communities, "create_final_communities", update_storage ) async def _update_nodes(dataframe_dict, storage, update_storage, merged_entities_df): """Update the nodes output.""" - old_nodes = await load_table_from_storage("create_final_nodes.parquet", storage) + old_nodes = await load_table_from_storage("create_final_nodes", storage) delta_nodes = dataframe_dict["create_final_nodes"] merged_nodes, community_id_mapping = _merge_and_resolve_nodes( old_nodes, delta_nodes, merged_entities_df ) - await 
update_storage.set("create_final_nodes.parquet", merged_nodes.to_parquet()) + await write_table_to_storage(merged_nodes, "create_final_nodes", update_storage) + return merged_nodes, community_id_mapping async def _update_covariates(dataframe_dict, storage, update_storage): """Update the covariates output.""" - old_covariates = await load_table_from_storage( - "create_final_covariates.parquet", storage - ) + old_covariates = await load_table_from_storage("create_final_covariates", storage) delta_covariates = dataframe_dict["create_final_covariates"] merged_covariates = _merge_covariates(old_covariates, delta_covariates) - await update_storage.set( - "create_final_covariates.parquet", merged_covariates.to_parquet() + + await write_table_to_storage( + merged_covariates, "create_final_covariates", update_storage ) @@ -235,17 +231,15 @@ async def _update_text_units( dataframe_dict, storage, update_storage, entity_id_mapping ): """Update the text units output.""" - old_text_units = await load_table_from_storage( - "create_final_text_units.parquet", storage - ) + old_text_units = await load_table_from_storage("create_final_text_units", storage) delta_text_units = dataframe_dict["create_final_text_units"] merged_text_units = _update_and_merge_text_units( old_text_units, delta_text_units, entity_id_mapping ) - await update_storage.set( - "create_final_text_units.parquet", merged_text_units.to_parquet() + await write_table_to_storage( + merged_text_units, "create_final_text_units", update_storage ) return merged_text_units @@ -254,7 +248,7 @@ async def _update_text_units( async def _update_relationships(dataframe_dict, storage, update_storage): """Update the relationships output.""" old_relationships = await load_table_from_storage( - "create_final_relationships.parquet", storage + "create_final_relationships", storage ) delta_relationships = dataframe_dict["create_final_relationships"] merged_relationships_df = _update_and_merge_relationships( @@ -262,8 +256,8 @@ async def 
_update_relationships(dataframe_dict, storage, update_storage): delta_relationships, ) - await update_storage.set( - "create_final_relationships.parquet", merged_relationships_df.to_parquet() + await write_table_to_storage( + merged_relationships_df, "create_final_relationships", update_storage ) return merged_relationships_df @@ -273,9 +267,7 @@ async def _update_entities( dataframe_dict, storage, update_storage, config, cache, callbacks ): """Update Final Entities output.""" - old_entities = await load_table_from_storage( - "create_final_entities.parquet", storage - ) + old_entities = await load_table_from_storage("create_final_entities", storage) delta_entities = dataframe_dict["create_final_entities"] merged_entities_df, entity_id_mapping = _group_and_resolve_entities( @@ -291,8 +283,8 @@ async def _update_entities( ) # Save the updated entities back to storage - await update_storage.set( - "create_final_entities.parquet", merged_entities_df.to_parquet() + await write_table_to_storage( + merged_entities_df, "create_final_entities", update_storage ) return merged_entities_df, entity_id_mapping @@ -310,13 +302,14 @@ async def _concat_dataframes(name, dataframe_dict, storage, update_storage): storage : PipelineStorage The storage used to store the dataframes. """ - old_df = await load_table_from_storage(f"{name}.parquet", storage) + old_df = await load_table_from_storage(name, storage) delta_df = dataframe_dict[name] # Merge the final documents final_df = pd.concat([old_df, delta_df], copy=False) - await update_storage.set(f"{name}.parquet", final_df.to_parquet()) + await write_table_to_storage(final_df, name, update_storage) + return final_df diff --git a/graphrag/index/utils/ds_util.py b/graphrag/index/utils/ds_util.py deleted file mode 100644 index e59d30754f..0000000000 --- a/graphrag/index/utils/ds_util.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
-# Licensed under the MIT License - -"""A utility module datashaper-specific utility methods.""" - -from typing import cast - -from datashaper import TableContainer, VerbInput - -_NAMED_INPUTS_REQUIRED = "Named inputs are required" - - -def get_required_input_table(input: VerbInput, name: str) -> TableContainer: - """Get a required input table by name.""" - return cast("TableContainer", get_named_input_table(input, name, required=True)) - - -def get_named_input_table( - input: VerbInput, name: str, required: bool = False -) -> TableContainer | None: - """Get an input table from datashaper verb-inputs by name.""" - named_inputs = input.named - if named_inputs is None: - if not required: - return None - raise ValueError(_NAMED_INPUTS_REQUIRED) - - result = named_inputs.get(name) - if result is None and required: - msg = f"input '${name}' is required" - raise ValueError(msg) - return result diff --git a/graphrag/index/utils/load_graph.py b/graphrag/index/utils/load_graph.py deleted file mode 100644 index 57992a04c8..0000000000 --- a/graphrag/index/utils/load_graph.py +++ /dev/null @@ -1,11 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""Networkx load_graph utility definition.""" - -import networkx as nx - - -def load_graph(graphml: str | nx.Graph) -> nx.Graph: - """Load a graph from a graphml file or a networkx graph.""" - return nx.parse_graphml(graphml) if isinstance(graphml, str) else graphml diff --git a/graphrag/index/utils/topological_sort.py b/graphrag/index/utils/topological_sort.py deleted file mode 100644 index a19b464559..0000000000 --- a/graphrag/index/utils/topological_sort.py +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
-# Licensed under the MIT License - -"""Topological sort utility method.""" - -from graphlib import TopologicalSorter - - -def topological_sort(graph: dict[str, list[str]]) -> list[str]: - """Topological sort.""" - ts = TopologicalSorter(graph) - return list(ts.static_order()) diff --git a/graphrag/index/validate_config.py b/graphrag/index/validate_config.py index 07e4638fc3..a98e4cb707 100644 --- a/graphrag/index/validate_config.py +++ b/graphrag/index/validate_config.py @@ -6,8 +6,7 @@ import asyncio import sys -from datashaper import NoopVerbCallbacks - +from graphrag.callbacks.noop_verb_callbacks import NoopVerbCallbacks from graphrag.config.models.graph_rag_config import GraphRagConfig from graphrag.index.llm.load_llm import load_llm, load_llm_embeddings from graphrag.logger.print_progress import ProgressLogger diff --git a/graphrag/index/workflows/__init__.py b/graphrag/index/workflows/__init__.py index db1cb74c7b..a904dc7bb8 100644 --- a/graphrag/index/workflows/__init__.py +++ b/graphrag/index/workflows/__init__.py @@ -1,25 +1,108 @@ # Copyright (c) 2024 Microsoft Corporation. 
# Licensed under the MIT License -"""The Indexing Engine workflows package root.""" -from graphrag.index.workflows.load import create_workflow, load_workflows -from graphrag.index.workflows.typing import ( - StepDefinition, - VerbDefinitions, - VerbTiming, - WorkflowConfig, - WorkflowDefinitions, - WorkflowToRun, +"""A package containing all built-in workflow definitions.""" + +from collections.abc import Awaitable, Callable + +import pandas as pd + +from graphrag.callbacks.verb_callbacks import VerbCallbacks +from graphrag.config.models.graph_rag_config import GraphRagConfig +from graphrag.index.context import PipelineRunContext + +from .compute_communities import ( + run_workflow as run_compute_communities, +) +from .compute_communities import ( + workflow_name as compute_communities, +) +from .create_base_text_units import ( + run_workflow as run_create_base_text_units, +) +from .create_base_text_units import ( + workflow_name as create_base_text_units, +) +from .create_final_communities import ( + run_workflow as run_create_final_communities, +) +from .create_final_communities import ( + workflow_name as create_final_communities, +) +from .create_final_community_reports import ( + run_workflow as run_create_final_community_reports, +) +from .create_final_community_reports import ( + workflow_name as create_final_community_reports, +) +from .create_final_covariates import ( + run_workflow as run_create_final_covariates, +) +from .create_final_covariates import ( + workflow_name as create_final_covariates, +) +from .create_final_documents import ( + run_workflow as run_create_final_documents, +) +from .create_final_documents import ( + workflow_name as create_final_documents, +) +from .create_final_entities import ( + run_workflow as run_create_final_entities, +) +from .create_final_entities import ( + workflow_name as create_final_entities, +) +from .create_final_nodes import ( + run_workflow as run_create_final_nodes, +) +from .create_final_nodes import ( + 
workflow_name as create_final_nodes, +) +from .create_final_relationships import ( + run_workflow as run_create_final_relationships, +) +from .create_final_relationships import ( + workflow_name as create_final_relationships, +) +from .create_final_text_units import ( + run_workflow as run_create_final_text_units, +) +from .create_final_text_units import ( + workflow_name as create_final_text_units, +) +from .extract_graph import ( + run_workflow as run_extract_graph, +) +from .extract_graph import ( + workflow_name as extract_graph, +) +from .generate_text_embeddings import ( + run_workflow as run_generate_text_embeddings, +) +from .generate_text_embeddings import ( + workflow_name as generate_text_embeddings, ) -__all__ = [ - "StepDefinition", - "VerbDefinitions", - "VerbTiming", - "WorkflowConfig", - "WorkflowDefinitions", - "WorkflowToRun", - "create_workflow", - "load_workflows", -] +all_workflows: dict[ + str, + Callable[ + [GraphRagConfig, PipelineRunContext, VerbCallbacks], + Awaitable[pd.DataFrame | None], + ], +] = { + compute_communities: run_compute_communities, + create_base_text_units: run_create_base_text_units, + create_final_communities: run_create_final_communities, + create_final_community_reports: run_create_final_community_reports, + create_final_covariates: run_create_final_covariates, + create_final_documents: run_create_final_documents, + create_final_entities: run_create_final_entities, + create_final_nodes: run_create_final_nodes, + create_final_relationships: run_create_final_relationships, + create_final_text_units: run_create_final_text_units, + extract_graph: run_extract_graph, + generate_text_embeddings: run_generate_text_embeddings, +} +"""This is a dictionary of all built-in workflows. 
To be replaced with an injectable provider!""" diff --git a/graphrag/index/workflows/compute_communities.py b/graphrag/index/workflows/compute_communities.py new file mode 100644 index 0000000000..51cf511d50 --- /dev/null +++ b/graphrag/index/workflows/compute_communities.py @@ -0,0 +1,40 @@ +# Copyright (c) 2024 Microsoft Corporation. +# Licensed under the MIT License + +"""A module containing run_workflow method definition.""" + +import pandas as pd + +from graphrag.callbacks.verb_callbacks import VerbCallbacks +from graphrag.config.models.graph_rag_config import GraphRagConfig +from graphrag.index.context import PipelineRunContext +from graphrag.index.flows.compute_communities import compute_communities +from graphrag.utils.storage import load_table_from_storage, write_table_to_storage + +workflow_name = "compute_communities" + + +async def run_workflow( + config: GraphRagConfig, + context: PipelineRunContext, + _callbacks: VerbCallbacks, +) -> pd.DataFrame | None: + """All the steps to create the base communities.""" + base_relationship_edges = await load_table_from_storage( + "base_relationship_edges", context.storage + ) + + max_cluster_size = config.cluster_graph.max_cluster_size + use_lcc = config.cluster_graph.use_lcc + seed = config.cluster_graph.seed + + base_communities = compute_communities( + base_relationship_edges, + max_cluster_size=max_cluster_size, + use_lcc=use_lcc, + seed=seed, + ) + + await write_table_to_storage(base_communities, "base_communities", context.storage) + + return base_communities diff --git a/graphrag/index/workflows/create_base_text_units.py b/graphrag/index/workflows/create_base_text_units.py new file mode 100644 index 0000000000..91d5822884 --- /dev/null +++ b/graphrag/index/workflows/create_base_text_units.py @@ -0,0 +1,41 @@ +# Copyright (c) 2024 Microsoft Corporation. 
+# Licensed under the MIT License + +"""A module containing run_workflow method definition.""" + +import pandas as pd + +from graphrag.callbacks.verb_callbacks import VerbCallbacks +from graphrag.config.models.graph_rag_config import GraphRagConfig +from graphrag.index.context import PipelineRunContext +from graphrag.index.flows.create_base_text_units import ( + create_base_text_units, +) +from graphrag.utils.storage import load_table_from_storage, write_table_to_storage + +workflow_name = "create_base_text_units" + + +async def run_workflow( + config: GraphRagConfig, + context: PipelineRunContext, + callbacks: VerbCallbacks, +) -> pd.DataFrame | None: + """All the steps to transform base text_units.""" + documents = await load_table_from_storage("input", context.storage) + + chunks = config.chunks + + output = create_base_text_units( + documents, + callbacks, + chunks.group_by_columns, + chunks.size, + chunks.overlap, + chunks.encoding_model, + strategy=chunks.strategy, + ) + + await write_table_to_storage(output, workflow_name, context.storage) + + return output diff --git a/graphrag/index/workflows/create_final_communities.py b/graphrag/index/workflows/create_final_communities.py new file mode 100644 index 0000000000..e1cf950e97 --- /dev/null +++ b/graphrag/index/workflows/create_final_communities.py @@ -0,0 +1,43 @@ +# Copyright (c) 2024 Microsoft Corporation. 
+# Licensed under the MIT License + +"""A module containing run_workflow method definition.""" + +import pandas as pd + +from graphrag.callbacks.verb_callbacks import VerbCallbacks +from graphrag.config.models.graph_rag_config import GraphRagConfig +from graphrag.index.context import PipelineRunContext +from graphrag.index.flows.create_final_communities import ( + create_final_communities, +) +from graphrag.utils.storage import load_table_from_storage, write_table_to_storage + +workflow_name = "create_final_communities" + + +async def run_workflow( + _config: GraphRagConfig, + context: PipelineRunContext, + _callbacks: VerbCallbacks, +) -> pd.DataFrame | None: + """All the steps to transform final communities.""" + base_entity_nodes = await load_table_from_storage( + "base_entity_nodes", context.storage + ) + base_relationship_edges = await load_table_from_storage( + "base_relationship_edges", context.storage + ) + base_communities = await load_table_from_storage( + "base_communities", context.storage + ) + + output = create_final_communities( + base_entity_nodes, + base_relationship_edges, + base_communities, + ) + + await write_table_to_storage(output, workflow_name, context.storage) + + return output diff --git a/graphrag/index/workflows/create_final_community_reports.py b/graphrag/index/workflows/create_final_community_reports.py new file mode 100644 index 0000000000..ee45b95a49 --- /dev/null +++ b/graphrag/index/workflows/create_final_community_reports.py @@ -0,0 +1,51 @@ +# Copyright (c) 2024 Microsoft Corporation. 
+# Licensed under the MIT License + +"""A module containing run_workflow method definition.""" + +import pandas as pd + +from graphrag.callbacks.verb_callbacks import VerbCallbacks +from graphrag.config.models.graph_rag_config import GraphRagConfig +from graphrag.index.context import PipelineRunContext +from graphrag.index.flows.create_final_community_reports import ( + create_final_community_reports, +) +from graphrag.utils.storage import load_table_from_storage, write_table_to_storage + +workflow_name = "create_final_community_reports" + + +async def run_workflow( + config: GraphRagConfig, + context: PipelineRunContext, + callbacks: VerbCallbacks, +) -> pd.DataFrame | None: + """All the steps to transform community reports.""" + nodes = await load_table_from_storage("create_final_nodes", context.storage) + edges = await load_table_from_storage("create_final_relationships", context.storage) + entities = await load_table_from_storage("create_final_entities", context.storage) + communities = await load_table_from_storage( + "create_final_communities", context.storage + ) + claims = await load_table_from_storage("create_final_covariates", context.storage) + async_mode = config.community_reports.async_mode + num_threads = config.community_reports.parallelization.num_threads + summarization_strategy = config.community_reports.resolved_strategy(config.root_dir) + + output = await create_final_community_reports( + nodes, + edges, + entities, + communities, + claims, + callbacks, + context.cache, + summarization_strategy, + async_mode=async_mode, + num_threads=num_threads, + ) + + await write_table_to_storage(output, workflow_name, context.storage) + + return output diff --git a/graphrag/index/workflows/create_final_covariates.py b/graphrag/index/workflows/create_final_covariates.py new file mode 100644 index 0000000000..9ab91fdf16 --- /dev/null +++ b/graphrag/index/workflows/create_final_covariates.py @@ -0,0 +1,49 @@ +# Copyright (c) 2024 Microsoft Corporation. 
+# Licensed under the MIT License + +"""A module containing run_workflow method definition.""" + +import pandas as pd + +from graphrag.callbacks.verb_callbacks import VerbCallbacks +from graphrag.config.models.graph_rag_config import GraphRagConfig +from graphrag.index.context import PipelineRunContext +from graphrag.index.flows.create_final_covariates import ( + create_final_covariates, +) +from graphrag.utils.storage import load_table_from_storage, write_table_to_storage + +workflow_name = "create_final_covariates" + + +async def run_workflow( + config: GraphRagConfig, + context: PipelineRunContext, + callbacks: VerbCallbacks, +) -> pd.DataFrame | None: + """All the steps to extract and format covariates.""" + text_units = await load_table_from_storage( + "create_base_text_units", context.storage + ) + + extraction_strategy = config.claim_extraction.resolved_strategy( + config.root_dir, config.encoding_model + ) + + async_mode = config.claim_extraction.async_mode + num_threads = config.claim_extraction.parallelization.num_threads + + output = await create_final_covariates( + text_units, + callbacks, + context.cache, + "claim", + extraction_strategy, + async_mode=async_mode, + entity_types=None, + num_threads=num_threads, + ) + + await write_table_to_storage(output, workflow_name, context.storage) + + return output diff --git a/graphrag/index/workflows/create_final_documents.py b/graphrag/index/workflows/create_final_documents.py new file mode 100644 index 0000000000..bbc1490b8f --- /dev/null +++ b/graphrag/index/workflows/create_final_documents.py @@ -0,0 +1,37 @@ +# Copyright (c) 2024 Microsoft Corporation. 
+# Licensed under the MIT License + +"""A module containing run_workflow method definition.""" + +import pandas as pd + +from graphrag.callbacks.verb_callbacks import VerbCallbacks +from graphrag.config.models.graph_rag_config import GraphRagConfig +from graphrag.index.context import PipelineRunContext +from graphrag.index.flows.create_final_documents import ( + create_final_documents, +) +from graphrag.utils.storage import load_table_from_storage, write_table_to_storage + +workflow_name = "create_final_documents" + + +async def run_workflow( + config: GraphRagConfig, + context: PipelineRunContext, + _callbacks: VerbCallbacks, +) -> pd.DataFrame | None: + """All the steps to transform final documents.""" + documents = await load_table_from_storage("input", context.storage) + text_units = await load_table_from_storage( + "create_base_text_units", context.storage + ) + + input = config.input + output = create_final_documents( + documents, text_units, input.document_attribute_columns + ) + + await write_table_to_storage(output, workflow_name, context.storage) + + return output diff --git a/graphrag/index/workflows/create_final_entities.py b/graphrag/index/workflows/create_final_entities.py new file mode 100644 index 0000000000..565da6cf6b --- /dev/null +++ b/graphrag/index/workflows/create_final_entities.py @@ -0,0 +1,33 @@ +# Copyright (c) 2024 Microsoft Corporation. 
+# Licensed under the MIT License + +"""A module containing run_workflow method definition.""" + +import pandas as pd + +from graphrag.callbacks.verb_callbacks import VerbCallbacks +from graphrag.config.models.graph_rag_config import GraphRagConfig +from graphrag.index.context import PipelineRunContext +from graphrag.index.flows.create_final_entities import ( + create_final_entities, +) +from graphrag.utils.storage import load_table_from_storage, write_table_to_storage + +workflow_name = "create_final_entities" + + +async def run_workflow( + _config: GraphRagConfig, + context: PipelineRunContext, + _callbacks: VerbCallbacks, +) -> pd.DataFrame | None: + """All the steps to transform final entities.""" + base_entity_nodes = await load_table_from_storage( + "base_entity_nodes", context.storage + ) + + output = create_final_entities(base_entity_nodes) + + await write_table_to_storage(output, workflow_name, context.storage) + + return output diff --git a/graphrag/index/workflows/create_final_nodes.py b/graphrag/index/workflows/create_final_nodes.py new file mode 100644 index 0000000000..aa1ec3c177 --- /dev/null +++ b/graphrag/index/workflows/create_final_nodes.py @@ -0,0 +1,49 @@ +# Copyright (c) 2024 Microsoft Corporation. 
+# Licensed under the MIT License + +"""A module containing run_workflow method definition.""" + +import pandas as pd + +from graphrag.callbacks.verb_callbacks import VerbCallbacks +from graphrag.config.models.graph_rag_config import GraphRagConfig +from graphrag.index.context import PipelineRunContext +from graphrag.index.flows.create_final_nodes import ( + create_final_nodes, +) +from graphrag.utils.storage import load_table_from_storage, write_table_to_storage + +workflow_name = "create_final_nodes" + + +async def run_workflow( + config: GraphRagConfig, + context: PipelineRunContext, + callbacks: VerbCallbacks, +) -> pd.DataFrame | None: + """All the steps to transform final nodes.""" + base_entity_nodes = await load_table_from_storage( + "base_entity_nodes", context.storage + ) + base_relationship_edges = await load_table_from_storage( + "base_relationship_edges", context.storage + ) + base_communities = await load_table_from_storage( + "base_communities", context.storage + ) + + embed_config = config.embed_graph + layout_enabled = config.umap.enabled + + output = create_final_nodes( + base_entity_nodes, + base_relationship_edges, + base_communities, + callbacks, + embed_config=embed_config, + layout_enabled=layout_enabled, + ) + + await write_table_to_storage(output, workflow_name, context.storage) + + return output diff --git a/graphrag/index/workflows/create_final_relationships.py b/graphrag/index/workflows/create_final_relationships.py new file mode 100644 index 0000000000..f6896420b0 --- /dev/null +++ b/graphrag/index/workflows/create_final_relationships.py @@ -0,0 +1,33 @@ +# Copyright (c) 2024 Microsoft Corporation. 
+# Licensed under the MIT License + +"""A module containing run_workflow method definition.""" + +import pandas as pd + +from graphrag.callbacks.verb_callbacks import VerbCallbacks +from graphrag.config.models.graph_rag_config import GraphRagConfig +from graphrag.index.context import PipelineRunContext +from graphrag.index.flows.create_final_relationships import ( + create_final_relationships, +) +from graphrag.utils.storage import load_table_from_storage, write_table_to_storage + +workflow_name = "create_final_relationships" + + +async def run_workflow( + _config: GraphRagConfig, + context: PipelineRunContext, + _callbacks: VerbCallbacks, +) -> pd.DataFrame | None: + """All the steps to transform final relationships.""" + base_relationship_edges = await load_table_from_storage( + "base_relationship_edges", context.storage + ) + + output = create_final_relationships(base_relationship_edges) + + await write_table_to_storage(output, workflow_name, context.storage) + + return output diff --git a/graphrag/index/workflows/create_final_text_units.py b/graphrag/index/workflows/create_final_text_units.py new file mode 100644 index 0000000000..54d974db4d --- /dev/null +++ b/graphrag/index/workflows/create_final_text_units.py @@ -0,0 +1,47 @@ +# Copyright (c) 2024 Microsoft Corporation. 
+# Licensed under the MIT License + +"""A module containing run_workflow method definition.""" + +import pandas as pd + +from graphrag.callbacks.verb_callbacks import VerbCallbacks +from graphrag.config.models.graph_rag_config import GraphRagConfig +from graphrag.index.context import PipelineRunContext +from graphrag.index.flows.create_final_text_units import ( + create_final_text_units, +) +from graphrag.utils.storage import load_table_from_storage, write_table_to_storage + +workflow_name = "create_final_text_units" + + +async def run_workflow( + _config: GraphRagConfig, + context: PipelineRunContext, + _callbacks: VerbCallbacks, +) -> pd.DataFrame | None: + """All the steps to transform the text units.""" + text_units = await load_table_from_storage( + "create_base_text_units", context.storage + ) + final_entities = await load_table_from_storage( + "create_final_entities", context.storage + ) + final_relationships = await load_table_from_storage( + "create_final_relationships", context.storage + ) + final_covariates = await load_table_from_storage( + "create_final_covariates", context.storage + ) + + output = create_final_text_units( + text_units, + final_entities, + final_relationships, + final_covariates, + ) + + await write_table_to_storage(output, workflow_name, context.storage) + + return output diff --git a/graphrag/index/workflows/default_workflows.py b/graphrag/index/workflows/default_workflows.py deleted file mode 100644 index 173b155e49..0000000000 --- a/graphrag/index/workflows/default_workflows.py +++ /dev/null @@ -1,157 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
-# Licensed under the MIT License - -"""A package containing default workflows definitions.""" - -from collections.abc import Awaitable, Callable - -import pandas as pd -from datashaper import VerbCallbacks - -from graphrag.config.models.graph_rag_config import GraphRagConfig -from graphrag.index.context import PipelineRunContext -from graphrag.index.workflows.typing import WorkflowDefinitions -from graphrag.index.workflows.v1.compute_communities import ( - build_steps as build_compute_communities_steps, -) -from graphrag.index.workflows.v1.compute_communities import ( - run_workflow as run_compute_communities, -) -from graphrag.index.workflows.v1.compute_communities import ( - workflow_name as compute_communities, -) -from graphrag.index.workflows.v1.create_base_text_units import ( - build_steps as build_create_base_text_units_steps, -) -from graphrag.index.workflows.v1.create_base_text_units import ( - run_workflow as run_create_base_text_units, -) -from graphrag.index.workflows.v1.create_base_text_units import ( - workflow_name as create_base_text_units, -) -from graphrag.index.workflows.v1.create_final_communities import ( - build_steps as build_create_final_communities_steps, -) -from graphrag.index.workflows.v1.create_final_communities import ( - run_workflow as run_create_final_communities, -) -from graphrag.index.workflows.v1.create_final_communities import ( - workflow_name as create_final_communities, -) -from graphrag.index.workflows.v1.create_final_community_reports import ( - build_steps as build_create_final_community_reports_steps, -) -from graphrag.index.workflows.v1.create_final_community_reports import ( - run_workflow as run_create_final_community_reports, -) -from graphrag.index.workflows.v1.create_final_community_reports import ( - workflow_name as create_final_community_reports, -) -from graphrag.index.workflows.v1.create_final_covariates import ( - build_steps as build_create_final_covariates_steps, -) -from 
graphrag.index.workflows.v1.create_final_covariates import ( - run_workflow as run_create_final_covariates, -) -from graphrag.index.workflows.v1.create_final_covariates import ( - workflow_name as create_final_covariates, -) -from graphrag.index.workflows.v1.create_final_documents import ( - build_steps as build_create_final_documents_steps, -) -from graphrag.index.workflows.v1.create_final_documents import ( - run_workflow as run_create_final_documents, -) -from graphrag.index.workflows.v1.create_final_documents import ( - workflow_name as create_final_documents, -) -from graphrag.index.workflows.v1.create_final_entities import ( - build_steps as build_create_final_entities_steps, -) -from graphrag.index.workflows.v1.create_final_entities import ( - run_workflow as run_create_final_entities, -) -from graphrag.index.workflows.v1.create_final_entities import ( - workflow_name as create_final_entities, -) -from graphrag.index.workflows.v1.create_final_nodes import ( - build_steps as build_create_final_nodes_steps, -) -from graphrag.index.workflows.v1.create_final_nodes import ( - run_workflow as run_create_final_nodes, -) -from graphrag.index.workflows.v1.create_final_nodes import ( - workflow_name as create_final_nodes, -) -from graphrag.index.workflows.v1.create_final_relationships import ( - build_steps as build_create_final_relationships_steps, -) -from graphrag.index.workflows.v1.create_final_relationships import ( - run_workflow as run_create_final_relationships, -) -from graphrag.index.workflows.v1.create_final_relationships import ( - workflow_name as create_final_relationships, -) -from graphrag.index.workflows.v1.create_final_text_units import ( - build_steps as build_create_final_text_units, -) -from graphrag.index.workflows.v1.create_final_text_units import ( - run_workflow as run_create_final_text_units, -) -from graphrag.index.workflows.v1.create_final_text_units import ( - workflow_name as create_final_text_units, -) -from 
graphrag.index.workflows.v1.extract_graph import ( - build_steps as build_extract_graph_steps, -) -from graphrag.index.workflows.v1.extract_graph import ( - run_workflow as run_extract_graph, -) -from graphrag.index.workflows.v1.extract_graph import ( - workflow_name as extract_graph, -) -from graphrag.index.workflows.v1.generate_text_embeddings import ( - build_steps as build_generate_text_embeddings_steps, -) -from graphrag.index.workflows.v1.generate_text_embeddings import ( - run_workflow as run_generate_text_embeddings, -) -from graphrag.index.workflows.v1.generate_text_embeddings import ( - workflow_name as generate_text_embeddings, -) - -default_workflows: WorkflowDefinitions = { - extract_graph: build_extract_graph_steps, - compute_communities: build_compute_communities_steps, - create_base_text_units: build_create_base_text_units_steps, - create_final_text_units: build_create_final_text_units, - create_final_community_reports: build_create_final_community_reports_steps, - create_final_nodes: build_create_final_nodes_steps, - create_final_relationships: build_create_final_relationships_steps, - create_final_documents: build_create_final_documents_steps, - create_final_covariates: build_create_final_covariates_steps, - create_final_entities: build_create_final_entities_steps, - create_final_communities: build_create_final_communities_steps, - generate_text_embeddings: build_generate_text_embeddings_steps, -} - -basic_workflows: dict[ - str, - Callable[ - [GraphRagConfig, PipelineRunContext, VerbCallbacks], - Awaitable[pd.DataFrame | None], - ], -] = { - compute_communities: run_compute_communities, - create_base_text_units: run_create_base_text_units, - create_final_communities: run_create_final_communities, - create_final_community_reports: run_create_final_community_reports, - create_final_covariates: run_create_final_covariates, - create_final_documents: run_create_final_documents, - create_final_entities: run_create_final_entities, - create_final_nodes: 
run_create_final_nodes, - create_final_relationships: run_create_final_relationships, - create_final_text_units: run_create_final_text_units, - extract_graph: run_extract_graph, - generate_text_embeddings: run_generate_text_embeddings, -} diff --git a/graphrag/index/workflows/extract_graph.py b/graphrag/index/workflows/extract_graph.py new file mode 100644 index 0000000000..454bf7806a --- /dev/null +++ b/graphrag/index/workflows/extract_graph.py @@ -0,0 +1,71 @@ +# Copyright (c) 2024 Microsoft Corporation. +# Licensed under the MIT License + +"""A module containing run_workflow method definition.""" + +import pandas as pd + +from graphrag.callbacks.verb_callbacks import VerbCallbacks +from graphrag.config.models.graph_rag_config import GraphRagConfig +from graphrag.index.context import PipelineRunContext +from graphrag.index.flows.extract_graph import ( + extract_graph, +) +from graphrag.index.operations.create_graph import create_graph +from graphrag.index.operations.snapshot_graphml import snapshot_graphml +from graphrag.utils.storage import load_table_from_storage, write_table_to_storage + +workflow_name = "extract_graph" + + +async def run_workflow( + config: GraphRagConfig, + context: PipelineRunContext, + callbacks: VerbCallbacks, +) -> pd.DataFrame | None: + """All the steps to create the base entity graph.""" + text_units = await load_table_from_storage( + "create_base_text_units", context.storage + ) + + extraction_strategy = config.entity_extraction.resolved_strategy( + config.root_dir, config.encoding_model + ) + extraction_num_threads = config.entity_extraction.parallelization.num_threads + extraction_async_mode = config.entity_extraction.async_mode + entity_types = config.entity_extraction.entity_types + + summarization_strategy = config.summarize_descriptions.resolved_strategy( + config.root_dir, + ) + summarization_num_threads = ( + config.summarize_descriptions.parallelization.num_threads + ) + + base_entity_nodes, base_relationship_edges = await 
extract_graph( + text_units, + callbacks, + context.cache, + extraction_strategy=extraction_strategy, + extraction_num_threads=extraction_num_threads, + extraction_async_mode=extraction_async_mode, + entity_types=entity_types, + summarization_strategy=summarization_strategy, + summarization_num_threads=summarization_num_threads, + ) + + await write_table_to_storage( + base_entity_nodes, "base_entity_nodes", context.storage + ) + await write_table_to_storage( + base_relationship_edges, "base_relationship_edges", context.storage + ) + + if config.snapshots.graphml: + # todo: extract graphs at each level, and add in meta like descriptions + graph = create_graph(base_relationship_edges) + await snapshot_graphml( + graph, + name="graph", + storage=context.storage, + ) diff --git a/graphrag/index/workflows/generate_text_embeddings.py b/graphrag/index/workflows/generate_text_embeddings.py new file mode 100644 index 0000000000..29a8bf0988 --- /dev/null +++ b/graphrag/index/workflows/generate_text_embeddings.py @@ -0,0 +1,57 @@ +# Copyright (c) 2024 Microsoft Corporation. 
+# Licensed under the MIT License + +"""A module containing run_workflow method definition.""" + +import pandas as pd + +from graphrag.callbacks.verb_callbacks import VerbCallbacks +from graphrag.config.models.graph_rag_config import GraphRagConfig +from graphrag.index.config.embeddings import get_embedded_fields, get_embedding_settings +from graphrag.index.context import PipelineRunContext +from graphrag.index.flows.generate_text_embeddings import ( + generate_text_embeddings, +) +from graphrag.utils.storage import load_table_from_storage + +workflow_name = "generate_text_embeddings" + + +async def run_workflow( + config: GraphRagConfig, + context: PipelineRunContext, + callbacks: VerbCallbacks, +) -> pd.DataFrame | None: + """All the steps to generate text embeddings.""" + final_documents = await load_table_from_storage( + "create_final_documents", context.storage + ) + final_relationships = await load_table_from_storage( + "create_final_relationships", context.storage + ) + final_text_units = await load_table_from_storage( + "create_final_text_units", context.storage + ) + final_entities = await load_table_from_storage( + "create_final_entities", context.storage + ) + final_community_reports = await load_table_from_storage( + "create_final_community_reports", context.storage + ) + + embedded_fields = get_embedded_fields(config) + text_embed = get_embedding_settings(config.embeddings) + + await generate_text_embeddings( + final_documents=final_documents, + final_relationships=final_relationships, + final_text_units=final_text_units, + final_entities=final_entities, + final_community_reports=final_community_reports, + callbacks=callbacks, + cache=context.cache, + storage=context.storage, + text_embed_config=text_embed, + embedded_fields=embedded_fields, + snapshot_embeddings_enabled=config.snapshots.embeddings, + ) diff --git a/graphrag/index/workflows/load.py b/graphrag/index/workflows/load.py deleted file mode 100644 index 5aa874ecba..0000000000 --- 
a/graphrag/index/workflows/load.py +++ /dev/null @@ -1,169 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""A module containing load_workflows, create_workflow, _get_steps_for_workflow and _remove_disabled_steps methods definition.""" - -from __future__ import annotations - -import logging -from collections.abc import Callable -from typing import TYPE_CHECKING, Any, NamedTuple, cast - -from datashaper import Workflow - -from graphrag.index.errors import ( - NoWorkflowsDefinedError, - UndefinedWorkflowError, - UnknownWorkflowError, -) -from graphrag.index.utils.topological_sort import topological_sort -from graphrag.index.workflows.default_workflows import ( - default_workflows as _default_workflows, -) -from graphrag.index.workflows.typing import ( - VerbDefinitions, - WorkflowDefinitions, - WorkflowToRun, -) - -if TYPE_CHECKING: - from graphrag.index.config.workflow import ( - PipelineWorkflowConfig, - PipelineWorkflowReference, - PipelineWorkflowStep, - ) - -anonymous_workflow_count = 0 - -VerbFn = Callable[..., Any] -log = logging.getLogger(__name__) - - -class LoadWorkflowResult(NamedTuple): - """A workflow loading result object.""" - - workflows: list[WorkflowToRun] - """The loaded workflow names in the order they should be run.""" - - dependencies: dict[str, list[str]] - """A dictionary of workflow name to workflow dependencies.""" - - -def load_workflows( - workflows_to_load: list[PipelineWorkflowReference], - additional_verbs: VerbDefinitions | None = None, - additional_workflows: WorkflowDefinitions | None = None, - memory_profile: bool = False, -) -> LoadWorkflowResult: - """Load the given workflows. 
- - Args: - - workflows_to_load - The workflows to load - - additional_verbs - The list of custom verbs available to the workflows - - additional_workflows - The list of custom workflows - Returns: - - output[0] - The loaded workflow names in the order they should be run - - output[1] - A dictionary of workflow name to workflow dependencies - """ - workflow_graph: dict[str, WorkflowToRun] = {} - - global anonymous_workflow_count - for reference in workflows_to_load: - name = reference.name - is_anonymous = name is None or name.strip() == "" - if is_anonymous: - name = f"Anonymous Workflow {anonymous_workflow_count}" - anonymous_workflow_count += 1 - name = cast("str", name) - - config = reference.config - workflow = create_workflow( - name or "MISSING NAME!", - reference.steps, - config, - additional_verbs, - additional_workflows, - ) - workflow_graph[name] = WorkflowToRun(workflow, config=config or {}) - - # Backfill any missing workflows - for name in list(workflow_graph.keys()): - workflow = workflow_graph[name] - deps = [ - d.replace("workflow:", "") - for d in workflow.workflow.dependencies - if d.startswith("workflow:") - ] - for dependency in deps: - if dependency not in workflow_graph: - reference = {"name": dependency, **workflow.config} - workflow_graph[dependency] = WorkflowToRun( - workflow=create_workflow( - dependency, - config=reference, - additional_verbs=additional_verbs, - additional_workflows=additional_workflows, - memory_profile=memory_profile, - ), - config=reference, - ) - - # Run workflows in order of dependencies - def filter_wf_dependencies(name: str) -> list[str]: - externals = [ - e.replace("workflow:", "") - for e in workflow_graph[name].workflow.dependencies - ] - return [e for e in externals if e in workflow_graph] - - task_graph = {name: filter_wf_dependencies(name) for name in workflow_graph} - workflow_run_order = topological_sort(task_graph) - workflows = [workflow_graph[name] for name in workflow_run_order] - log.info("Workflow 
Run Order: %s", workflow_run_order) - return LoadWorkflowResult(workflows=workflows, dependencies=task_graph) - - -def create_workflow( - name: str, - steps: list[PipelineWorkflowStep] | None = None, - config: PipelineWorkflowConfig | None = None, - additional_verbs: VerbDefinitions | None = None, - additional_workflows: WorkflowDefinitions | None = None, - memory_profile: bool = False, -) -> Workflow: - """Create a workflow from the given config.""" - additional_workflows = { - **_default_workflows, - **(additional_workflows or {}), - } - steps = steps or _get_steps_for_workflow(name, config, additional_workflows) - return Workflow( - verbs=additional_verbs or {}, - schema={ - "name": name, - "steps": steps, - }, - validate=False, - memory_profile=memory_profile, - ) - - -def _get_steps_for_workflow( - name: str | None, - config: PipelineWorkflowConfig | None, - workflows: dict[str, Callable] | None, -) -> list[PipelineWorkflowStep]: - """Get the steps for the given workflow config.""" - if config is not None and "steps" in config: - return config["steps"] - - if workflows is None: - raise NoWorkflowsDefinedError - - if name is None: - raise UndefinedWorkflowError - - if name not in workflows: - raise UnknownWorkflowError(name) - - return workflows[name](config or {}) diff --git a/graphrag/index/workflows/typing.py b/graphrag/index/workflows/typing.py deleted file mode 100644 index 3b44545bd4..0000000000 --- a/graphrag/index/workflows/typing.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
-# Licensed under the MIT License - -"""A module containing 'WorkflowToRun' model.""" - -from collections.abc import Callable -from dataclasses import dataclass as dc_dataclass -from typing import Any - -from datashaper import TableContainer, Workflow - -StepDefinition = dict[str, Any] -"""A step definition.""" - -VerbDefinitions = dict[str, Callable[..., TableContainer]] -"""A mapping of verb names to their implementations.""" - -WorkflowConfig = dict[str, Any] -"""A workflow configuration.""" - -WorkflowDefinitions = dict[str, Callable[[WorkflowConfig], list[StepDefinition]]] -"""A mapping of workflow names to their implementations.""" - -VerbTiming = dict[str, float] -"""The timings of verbs by id.""" - - -@dc_dataclass -class WorkflowToRun: - """Workflow to run class definition.""" - - workflow: Workflow - config: dict[str, Any] diff --git a/graphrag/index/workflows/v1/__init__.py b/graphrag/index/workflows/v1/__init__.py deleted file mode 100644 index 69518f5ee2..0000000000 --- a/graphrag/index/workflows/v1/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""The Indexing Engine workflows package root.""" diff --git a/graphrag/index/workflows/v1/compute_communities.py b/graphrag/index/workflows/v1/compute_communities.py deleted file mode 100644 index 1400263d6c..0000000000 --- a/graphrag/index/workflows/v1/compute_communities.py +++ /dev/null @@ -1,126 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
-# Licensed under the MIT License - -"""A module containing build_steps method definition.""" - -from typing import TYPE_CHECKING, cast - -import pandas as pd -from datashaper import ( - Table, - VerbCallbacks, - verb, -) -from datashaper.table_store.types import VerbResult, create_verb_result - -from graphrag.config.models.graph_rag_config import GraphRagConfig -from graphrag.index.config.workflow import PipelineWorkflowConfig, PipelineWorkflowStep -from graphrag.index.context import PipelineRunContext -from graphrag.index.flows.compute_communities import compute_communities -from graphrag.index.operations.snapshot import snapshot -from graphrag.storage.pipeline_storage import PipelineStorage - -if TYPE_CHECKING: - from graphrag.config.models.cluster_graph_config import ClusterGraphConfig - -workflow_name = "compute_communities" - - -def build_steps( - config: PipelineWorkflowConfig, -) -> list[PipelineWorkflowStep]: - """ - Create the base communities from the graph edges. - - ## Dependencies - * `workflow:extract_graph` - """ - clustering_config = cast("ClusterGraphConfig", config.get("cluster_graph")) - max_cluster_size = clustering_config.max_cluster_size - use_lcc = clustering_config.use_lcc - seed = clustering_config.seed - - snapshot_transient = config.get("snapshot_transient", False) or False - - return [ - { - "verb": workflow_name, - "args": { - "max_cluster_size": max_cluster_size, - "use_lcc": use_lcc, - "seed": seed, - "snapshot_transient_enabled": snapshot_transient, - }, - "input": ({"source": "workflow:extract_graph"}), - }, - ] - - -@verb( - name=workflow_name, - treats_input_tables_as_immutable=True, -) -async def workflow( - storage: PipelineStorage, - runtime_storage: PipelineStorage, - max_cluster_size: int, - use_lcc: bool, - seed: int | None, - snapshot_transient_enabled: bool = False, - **_kwargs: dict, -) -> VerbResult: - """All the steps to create the base communities.""" - base_relationship_edges = await 
runtime_storage.get("base_relationship_edges") - - base_communities = compute_communities( - base_relationship_edges, - max_cluster_size=max_cluster_size, - use_lcc=use_lcc, - seed=seed, - ) - - await runtime_storage.set("base_communities", base_communities) - - if snapshot_transient_enabled: - await snapshot( - base_communities, - name="base_communities", - storage=storage, - formats=["parquet"], - ) - - return create_verb_result(cast("Table", pd.DataFrame())) - - -async def run_workflow( - config: GraphRagConfig, - context: PipelineRunContext, - _callbacks: VerbCallbacks, -) -> pd.DataFrame | None: - """All the steps to create the base communities.""" - base_relationship_edges = await context.runtime_storage.get( - "base_relationship_edges" - ) - - max_cluster_size = config.cluster_graph.max_cluster_size - use_lcc = config.cluster_graph.use_lcc - seed = config.cluster_graph.seed - - base_communities = compute_communities( - base_relationship_edges, - max_cluster_size=max_cluster_size, - use_lcc=use_lcc, - seed=seed, - ) - - await context.runtime_storage.set("base_communities", base_communities) - - if config.snapshots.transient: - await snapshot( - base_communities, - name="base_communities", - storage=context.storage, - formats=["parquet"], - ) - - return base_communities diff --git a/graphrag/index/workflows/v1/create_base_text_units.py b/graphrag/index/workflows/v1/create_base_text_units.py deleted file mode 100644 index 5ab24e175a..0000000000 --- a/graphrag/index/workflows/v1/create_base_text_units.py +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
-# Licensed under the MIT License - -"""A module containing build_steps method definition.""" - -from typing import TYPE_CHECKING, cast - -import pandas as pd -from datashaper import ( - DEFAULT_INPUT_NAME, - Table, - VerbCallbacks, - VerbInput, - verb, -) -from datashaper.table_store.types import VerbResult, create_verb_result - -from graphrag.config.models.chunking_config import ChunkStrategyType -from graphrag.config.models.graph_rag_config import GraphRagConfig -from graphrag.index.config.workflow import PipelineWorkflowConfig, PipelineWorkflowStep -from graphrag.index.context import PipelineRunContext -from graphrag.index.flows.create_base_text_units import ( - create_base_text_units, -) -from graphrag.index.operations.snapshot import snapshot -from graphrag.storage.pipeline_storage import PipelineStorage - -if TYPE_CHECKING: - from graphrag.config.models.chunking_config import ChunkingConfig - -workflow_name = "create_base_text_units" - - -def build_steps( - config: PipelineWorkflowConfig, -) -> list[PipelineWorkflowStep]: - """ - Create the base table for text units. 
- - ## Dependencies - (input dataframe) - """ - chunks = cast("ChunkingConfig", config.get("chunks")) - group_by_columns = chunks.group_by_columns - size = chunks.size - overlap = chunks.overlap - encoding_model = chunks.encoding_model - strategy = chunks.strategy - - snapshot_transient = config.get("snapshot_transient", False) or False - return [ - { - "verb": workflow_name, - "args": { - "group_by_columns": group_by_columns, - "size": size, - "overlap": overlap, - "encoding_model": encoding_model, - "strategy": strategy, - "snapshot_transient_enabled": snapshot_transient, - }, - "input": {"source": DEFAULT_INPUT_NAME}, - }, - ] - - -@verb(name=workflow_name, treats_input_tables_as_immutable=True) -async def workflow( - input: VerbInput, - callbacks: VerbCallbacks, - storage: PipelineStorage, - runtime_storage: PipelineStorage, - group_by_columns: list[str], - size: int, - overlap: int, - encoding_model: str, - strategy: ChunkStrategyType, - snapshot_transient_enabled: bool = False, - **_kwargs: dict, -) -> VerbResult: - """All the steps to transform base text_units.""" - source = cast("pd.DataFrame", input.get_input()) - - output = create_base_text_units( - source, - callbacks, - group_by_columns, - size, - overlap, - encoding_model, - strategy=strategy, - ) - - await runtime_storage.set("base_text_units", output) - - if snapshot_transient_enabled: - await snapshot( - output, - name="create_base_text_units", - storage=storage, - formats=["parquet"], - ) - - return create_verb_result( - cast( - "Table", - pd.DataFrame(), - ) - ) - - -async def run_workflow( - config: GraphRagConfig, - context: PipelineRunContext, - callbacks: VerbCallbacks, -) -> pd.DataFrame | None: - """All the steps to transform base text_units.""" - documents = await context.runtime_storage.get("input") - - chunks = config.chunks - - output = create_base_text_units( - documents, - callbacks, - chunks.group_by_columns, - chunks.size, - chunks.overlap, - chunks.encoding_model, - 
strategy=chunks.strategy, - ) - - await context.runtime_storage.set("base_text_units", output) - - if config.snapshots.transient: - await snapshot( - output, - name="create_base_text_units", - storage=context.storage, - formats=["parquet"], - ) - - return output diff --git a/graphrag/index/workflows/v1/create_final_communities.py b/graphrag/index/workflows/v1/create_final_communities.py deleted file mode 100644 index 2dfcd2ceaf..0000000000 --- a/graphrag/index/workflows/v1/create_final_communities.py +++ /dev/null @@ -1,92 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""A module containing build_steps method definition.""" - -from typing import cast - -import pandas as pd -from datashaper import ( - Table, - VerbCallbacks, - verb, -) -from datashaper.table_store.types import VerbResult, create_verb_result - -from graphrag.config.models.graph_rag_config import GraphRagConfig -from graphrag.index.config.workflow import PipelineWorkflowConfig, PipelineWorkflowStep -from graphrag.index.context import PipelineRunContext -from graphrag.index.flows.create_final_communities import ( - create_final_communities, -) -from graphrag.index.operations.snapshot import snapshot -from graphrag.storage.pipeline_storage import PipelineStorage - -workflow_name = "create_final_communities" - - -def build_steps( - _config: PipelineWorkflowConfig, -) -> list[PipelineWorkflowStep]: - """ - Create the final communities table. 
- - ## Dependencies - * `workflow:extract_graph` - """ - return [ - { - "verb": workflow_name, - "input": {"source": "workflow:extract_graph"}, - }, - ] - - -@verb(name=workflow_name, treats_input_tables_as_immutable=True) -async def workflow( - runtime_storage: PipelineStorage, - **_kwargs: dict, -) -> VerbResult: - """All the steps to transform final communities.""" - base_entity_nodes = await runtime_storage.get("base_entity_nodes") - base_relationship_edges = await runtime_storage.get("base_relationship_edges") - base_communities = await runtime_storage.get("base_communities") - output = create_final_communities( - base_entity_nodes, - base_relationship_edges, - base_communities, - ) - - return create_verb_result( - cast( - "Table", - output, - ) - ) - - -async def run_workflow( - _config: GraphRagConfig, - context: PipelineRunContext, - _callbacks: VerbCallbacks, -) -> pd.DataFrame | None: - """All the steps to transform final communities.""" - base_entity_nodes = await context.runtime_storage.get("base_entity_nodes") - base_relationship_edges = await context.runtime_storage.get( - "base_relationship_edges" - ) - base_communities = await context.runtime_storage.get("base_communities") - output = create_final_communities( - base_entity_nodes, - base_relationship_edges, - base_communities, - ) - - await snapshot( - output, - name="create_final_communities", - storage=context.storage, - formats=["parquet"], - ) - - return output diff --git a/graphrag/index/workflows/v1/create_final_community_reports.py b/graphrag/index/workflows/v1/create_final_community_reports.py deleted file mode 100644 index fcd8743c68..0000000000 --- a/graphrag/index/workflows/v1/create_final_community_reports.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
-# Licensed under the MIT License - -"""A module containing build_steps method definition.""" - -from typing import cast - -import pandas as pd -from datashaper import ( - AsyncType, - Table, - VerbCallbacks, - VerbInput, - verb, -) -from datashaper.table_store.types import VerbResult, create_verb_result - -from graphrag.cache.pipeline_cache import PipelineCache -from graphrag.config.models.graph_rag_config import GraphRagConfig -from graphrag.index.config.workflow import PipelineWorkflowConfig, PipelineWorkflowStep -from graphrag.index.context import PipelineRunContext -from graphrag.index.flows.create_final_community_reports import ( - create_final_community_reports, -) -from graphrag.index.operations.snapshot import snapshot -from graphrag.index.utils.ds_util import get_named_input_table, get_required_input_table -from graphrag.utils.storage import load_table_from_storage - -workflow_name = "create_final_community_reports" - - -def build_steps( - config: PipelineWorkflowConfig, -) -> list[PipelineWorkflowStep]: - """ - Create the final community reports table. 
- - ## Dependencies - * `workflow:extract_graph` - """ - covariates_enabled = config.get("covariates_enabled", False) - create_community_reports_config = config.get("create_community_reports", {}) - summarization_strategy = create_community_reports_config.get("strategy") - async_mode = create_community_reports_config.get("async_mode") - num_threads = create_community_reports_config.get("num_threads") - - input = { - "source": "workflow:create_final_nodes", - "relationships": "workflow:create_final_relationships", - "entities": "workflow:create_final_entities", - "communities": "workflow:create_final_communities", - } - if covariates_enabled: - input["covariates"] = "workflow:create_final_covariates" - - return [ - { - "verb": workflow_name, - "args": { - "summarization_strategy": summarization_strategy, - "async_mode": async_mode, - "num_threads": num_threads, - }, - "input": input, - }, - ] - - -@verb(name=workflow_name, treats_input_tables_as_immutable=True) -async def workflow( - input: VerbInput, - callbacks: VerbCallbacks, - cache: PipelineCache, - summarization_strategy: dict, - async_mode: AsyncType = AsyncType.AsyncIO, - num_threads: int = 4, - **_kwargs: dict, -) -> VerbResult: - """All the steps to transform community reports.""" - nodes = cast("pd.DataFrame", input.get_input()) - edges = cast("pd.DataFrame", get_required_input_table(input, "relationships").table) - entities = cast("pd.DataFrame", get_required_input_table(input, "entities").table) - communities = cast( - "pd.DataFrame", get_required_input_table(input, "communities").table - ) - - claims = get_named_input_table(input, "covariates") - if claims: - claims = cast("pd.DataFrame", claims.table) - - output = await create_final_community_reports( - nodes, - edges, - entities, - communities, - claims, - callbacks, - cache, - summarization_strategy, - async_mode=async_mode, - num_threads=num_threads, - ) - - return create_verb_result( - cast( - "Table", - output, - ) - ) - - -async def 
run_workflow( - config: GraphRagConfig, - context: PipelineRunContext, - callbacks: VerbCallbacks, -) -> pd.DataFrame | None: - """All the steps to transform community reports.""" - nodes = await load_table_from_storage("create_final_nodes.parquet", context.storage) - edges = await load_table_from_storage( - "create_final_relationships.parquet", context.storage - ) - entities = await load_table_from_storage( - "create_final_entities.parquet", context.storage - ) - communities = await load_table_from_storage( - "create_final_communities.parquet", context.storage - ) - claims = await load_table_from_storage( - "create_final_covariates.parquet", context.storage - ) - - async_mode = config.community_reports.async_mode - num_threads = config.community_reports.parallelization.num_threads - summarization_strategy = config.community_reports.resolved_strategy(config.root_dir) - - output = await create_final_community_reports( - nodes, - edges, - entities, - communities, - claims, - callbacks, - context.cache, - summarization_strategy, - async_mode=async_mode, - num_threads=num_threads, - ) - - await snapshot( - output, - name="create_final_community_reports", - storage=context.storage, - formats=["parquet"], - ) - - return output diff --git a/graphrag/index/workflows/v1/create_final_covariates.py b/graphrag/index/workflows/v1/create_final_covariates.py deleted file mode 100644 index 7eb606ae54..0000000000 --- a/graphrag/index/workflows/v1/create_final_covariates.py +++ /dev/null @@ -1,120 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
-# Licensed under the MIT License - -"""A module containing build_steps method definition.""" - -from typing import Any, cast - -import pandas as pd -from datashaper import ( - AsyncType, - Table, - VerbCallbacks, - verb, -) -from datashaper.table_store.types import VerbResult, create_verb_result - -from graphrag.cache.pipeline_cache import PipelineCache -from graphrag.config.models.graph_rag_config import GraphRagConfig -from graphrag.index.config.workflow import PipelineWorkflowConfig, PipelineWorkflowStep -from graphrag.index.context import PipelineRunContext -from graphrag.index.flows.create_final_covariates import ( - create_final_covariates, -) -from graphrag.index.operations.snapshot import snapshot -from graphrag.storage.pipeline_storage import PipelineStorage - -workflow_name = "create_final_covariates" - - -def build_steps( - config: PipelineWorkflowConfig, -) -> list[PipelineWorkflowStep]: - """ - Create the final covariates table. - - ## Dependencies - * `workflow:create_base_text_units` - """ - claim_extract_config = config.get("claim_extract", {}) - extraction_strategy = claim_extract_config.get("strategy") - async_mode = claim_extract_config.get("async_mode", AsyncType.AsyncIO) - num_threads = claim_extract_config.get("num_threads") - - return [ - { - "verb": workflow_name, - "args": { - "covariate_type": "claim", - "extraction_strategy": extraction_strategy, - "async_mode": async_mode, - "num_threads": num_threads, - }, - "input": {"source": "workflow:create_base_text_units"}, - }, - ] - - -@verb(name=workflow_name, treats_input_tables_as_immutable=True) -async def workflow( - callbacks: VerbCallbacks, - cache: PipelineCache, - runtime_storage: PipelineStorage, - covariate_type: str, - extraction_strategy: dict[str, Any] | None, - async_mode: AsyncType = AsyncType.AsyncIO, - entity_types: list[str] | None = None, - num_threads: int = 4, - **_kwargs: dict, -) -> VerbResult: - """All the steps to extract and format covariates.""" - text_units = await 
runtime_storage.get("base_text_units") - - output = await create_final_covariates( - text_units, - callbacks, - cache, - covariate_type, - extraction_strategy, - async_mode=async_mode, - entity_types=entity_types, - num_threads=num_threads, - ) - - return create_verb_result(cast("Table", output)) - - -async def run_workflow( - config: GraphRagConfig, - context: PipelineRunContext, - callbacks: VerbCallbacks, -) -> pd.DataFrame | None: - """All the steps to extract and format covariates.""" - text_units = await context.runtime_storage.get("base_text_units") - - extraction_strategy = config.claim_extraction.resolved_strategy( - config.root_dir, config.encoding_model - ) - - async_mode = config.claim_extraction.async_mode - num_threads = config.claim_extraction.parallelization.num_threads - - output = await create_final_covariates( - text_units, - callbacks, - context.cache, - "claim", - extraction_strategy, - async_mode=async_mode, - entity_types=None, - num_threads=num_threads, - ) - - await snapshot( - output, - name="create_final_covariates", - storage=context.storage, - formats=["parquet"], - ) - - return output diff --git a/graphrag/index/workflows/v1/create_final_documents.py b/graphrag/index/workflows/v1/create_final_documents.py deleted file mode 100644 index f570d5e473..0000000000 --- a/graphrag/index/workflows/v1/create_final_documents.py +++ /dev/null @@ -1,92 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
-# Licensed under the MIT License - -"""A module containing build_steps method definition.""" - -from typing import cast - -import pandas as pd -from datashaper import ( - DEFAULT_INPUT_NAME, - Table, - VerbCallbacks, - VerbInput, - verb, -) -from datashaper.table_store.types import VerbResult, create_verb_result - -from graphrag.config.models.graph_rag_config import GraphRagConfig -from graphrag.index.config.workflow import PipelineWorkflowConfig, PipelineWorkflowStep -from graphrag.index.context import PipelineRunContext -from graphrag.index.flows.create_final_documents import ( - create_final_documents, -) -from graphrag.index.operations.snapshot import snapshot -from graphrag.storage.pipeline_storage import PipelineStorage - -workflow_name = "create_final_documents" - - -def build_steps( - config: PipelineWorkflowConfig, -) -> list[PipelineWorkflowStep]: - """ - Create the final documents table. - - ## Dependencies - * `workflow:create_base_text_units` - """ - document_attribute_columns = config.get("document_attribute_columns", None) - return [ - { - "verb": workflow_name, - "args": {"document_attribute_columns": document_attribute_columns}, - "input": { - "source": DEFAULT_INPUT_NAME, - "text_units": "workflow:create_base_text_units", - }, - }, - ] - - -@verb( - name=workflow_name, - treats_input_tables_as_immutable=True, -) -async def workflow( - input: VerbInput, - runtime_storage: PipelineStorage, - document_attribute_columns: list[str] | None = None, - **_kwargs: dict, -) -> VerbResult: - """All the steps to transform final documents.""" - source = cast("pd.DataFrame", input.get_input()) - text_units = await runtime_storage.get("base_text_units") - - output = create_final_documents(source, text_units, document_attribute_columns) - - return create_verb_result(cast("Table", output)) - - -async def run_workflow( - config: GraphRagConfig, - context: PipelineRunContext, - _callbacks: VerbCallbacks, -) -> pd.DataFrame | None: - """All the steps to transform 
final documents.""" - documents = await context.runtime_storage.get("input") - text_units = await context.runtime_storage.get("base_text_units") - - input = config.input - output = create_final_documents( - documents, text_units, input.document_attribute_columns - ) - - await snapshot( - output, - name="create_final_documents", - storage=context.storage, - formats=["parquet"], - ) - - return output diff --git a/graphrag/index/workflows/v1/create_final_entities.py b/graphrag/index/workflows/v1/create_final_entities.py deleted file mode 100644 index 38e6ee88ee..0000000000 --- a/graphrag/index/workflows/v1/create_final_entities.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""A module containing build_steps method definition.""" - -import logging -from typing import cast - -import pandas as pd -from datashaper import ( - Table, - VerbCallbacks, - verb, -) -from datashaper.table_store.types import VerbResult, create_verb_result - -from graphrag.config.models.graph_rag_config import GraphRagConfig -from graphrag.index.config.workflow import PipelineWorkflowConfig, PipelineWorkflowStep -from graphrag.index.context import PipelineRunContext -from graphrag.index.flows.create_final_entities import ( - create_final_entities, -) -from graphrag.index.operations.snapshot import snapshot -from graphrag.storage.pipeline_storage import PipelineStorage - -workflow_name = "create_final_entities" -log = logging.getLogger(__name__) - - -def build_steps( - config: PipelineWorkflowConfig, # noqa: ARG001 -) -> list[PipelineWorkflowStep]: - """ - Create the final entities table. 
- - ## Dependencies - * `workflow:extract_graph` - """ - return [ - { - "verb": workflow_name, - "args": {}, - "input": {"source": "workflow:extract_graph"}, - }, - ] - - -@verb( - name=workflow_name, - treats_input_tables_as_immutable=True, -) -async def workflow( - runtime_storage: PipelineStorage, - **_kwargs: dict, -) -> VerbResult: - """All the steps to transform final entities.""" - base_entity_nodes = await runtime_storage.get("base_entity_nodes") - - output = create_final_entities(base_entity_nodes) - - return create_verb_result(cast("Table", output)) - - -async def run_workflow( - _config: GraphRagConfig, - context: PipelineRunContext, - _callbacks: VerbCallbacks, -) -> pd.DataFrame | None: - """All the steps to transform final entities.""" - base_entity_nodes = await context.runtime_storage.get("base_entity_nodes") - - output = create_final_entities(base_entity_nodes) - - await snapshot( - output, - name="create_final_entities", - storage=context.storage, - formats=["parquet"], - ) - - return output diff --git a/graphrag/index/workflows/v1/create_final_nodes.py b/graphrag/index/workflows/v1/create_final_nodes.py deleted file mode 100644 index 4dc9b49d3f..0000000000 --- a/graphrag/index/workflows/v1/create_final_nodes.py +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
-# Licensed under the MIT License - -"""A module containing build_steps method definition.""" - -from typing import cast - -import pandas as pd -from datashaper import ( - Table, - VerbCallbacks, - verb, -) -from datashaper.table_store.types import VerbResult, create_verb_result - -from graphrag.config.models.embed_graph_config import EmbedGraphConfig -from graphrag.config.models.graph_rag_config import GraphRagConfig -from graphrag.index.config.workflow import PipelineWorkflowConfig, PipelineWorkflowStep -from graphrag.index.context import PipelineRunContext -from graphrag.index.flows.create_final_nodes import ( - create_final_nodes, -) -from graphrag.index.operations.snapshot import snapshot -from graphrag.storage.pipeline_storage import PipelineStorage - -workflow_name = "create_final_nodes" - - -def build_steps( - config: PipelineWorkflowConfig, -) -> list[PipelineWorkflowStep]: - """ - Create the base table for the document graph. - - ## Dependencies - * `workflow:extract_graph` - """ - layout_enabled = config["layout_enabled"] - embed_config = cast("EmbedGraphConfig", config["embed_graph"]) - - return [ - { - "verb": workflow_name, - "args": {"layout_enabled": layout_enabled, "embed_config": embed_config}, - "input": { - "source": "workflow:extract_graph", - "communities": "workflow:compute_communities", - }, - }, - ] - - -@verb(name=workflow_name, treats_input_tables_as_immutable=True) -async def workflow( - callbacks: VerbCallbacks, - runtime_storage: PipelineStorage, - embed_config: EmbedGraphConfig, - layout_enabled: bool, - **_kwargs: dict, -) -> VerbResult: - """All the steps to transform final nodes.""" - base_entity_nodes = await runtime_storage.get("base_entity_nodes") - base_relationship_edges = await runtime_storage.get("base_relationship_edges") - base_communities = await runtime_storage.get("base_communities") - - output = create_final_nodes( - base_entity_nodes, - base_relationship_edges, - base_communities, - callbacks, - 
embed_config=embed_config, - layout_enabled=layout_enabled, - ) - - return create_verb_result( - cast( - "Table", - output, - ) - ) - - -async def run_workflow( - config: GraphRagConfig, - context: PipelineRunContext, - callbacks: VerbCallbacks, -) -> pd.DataFrame | None: - """All the steps to transform final nodes.""" - base_entity_nodes = await context.runtime_storage.get("base_entity_nodes") - base_relationship_edges = await context.runtime_storage.get( - "base_relationship_edges" - ) - base_communities = await context.runtime_storage.get("base_communities") - - embed_config = config.embed_graph - layout_enabled = config.umap.enabled - - output = create_final_nodes( - base_entity_nodes, - base_relationship_edges, - base_communities, - callbacks, - embed_config=embed_config, - layout_enabled=layout_enabled, - ) - await snapshot( - output, - name="create_final_nodes", - storage=context.storage, - formats=["parquet"], - ) - - return output diff --git a/graphrag/index/workflows/v1/create_final_relationships.py b/graphrag/index/workflows/v1/create_final_relationships.py deleted file mode 100644 index e7c803a0bf..0000000000 --- a/graphrag/index/workflows/v1/create_final_relationships.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
-# Licensed under the MIT License - -"""A module containing build_steps method definition.""" - -import logging -from typing import cast - -import pandas as pd -from datashaper import ( - Table, - VerbCallbacks, - verb, -) -from datashaper.table_store.types import VerbResult, create_verb_result - -from graphrag.config.models.graph_rag_config import GraphRagConfig -from graphrag.index.config.workflow import PipelineWorkflowConfig, PipelineWorkflowStep -from graphrag.index.context import PipelineRunContext -from graphrag.index.flows.create_final_relationships import ( - create_final_relationships, -) -from graphrag.index.operations.snapshot import snapshot -from graphrag.storage.pipeline_storage import PipelineStorage - -workflow_name = "create_final_relationships" - -log = logging.getLogger(__name__) - - -def build_steps( - config: PipelineWorkflowConfig, # noqa: ARG001 -) -> list[PipelineWorkflowStep]: - """ - Create the final relationships table. - - ## Dependencies - * `workflow:extract_graph` - """ - return [ - { - "verb": workflow_name, - "args": {}, - "input": { - "source": "workflow:extract_graph", - }, - }, - ] - - -@verb( - name=workflow_name, - treats_input_tables_as_immutable=True, -) -async def workflow( - runtime_storage: PipelineStorage, - **_kwargs: dict, -) -> VerbResult: - """All the steps to transform final relationships.""" - base_relationship_edges = await runtime_storage.get("base_relationship_edges") - - output = create_final_relationships(base_relationship_edges) - - return create_verb_result(cast("Table", output)) - - -async def run_workflow( - _config: GraphRagConfig, - context: PipelineRunContext, - _callbacks: VerbCallbacks, -) -> pd.DataFrame | None: - """All the steps to transform final relationships.""" - base_relationship_edges = await context.runtime_storage.get( - "base_relationship_edges" - ) - - output = create_final_relationships(base_relationship_edges) - - await snapshot( - output, - name="create_final_relationships", - 
storage=context.storage, - formats=["parquet"], - ) - - return output diff --git a/graphrag/index/workflows/v1/create_final_text_units.py b/graphrag/index/workflows/v1/create_final_text_units.py deleted file mode 100644 index 578e1ecbb0..0000000000 --- a/graphrag/index/workflows/v1/create_final_text_units.py +++ /dev/null @@ -1,123 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""A module containing build_steps method definition.""" - -from typing import cast - -import pandas as pd -from datashaper import ( - Table, - VerbCallbacks, - VerbInput, - VerbResult, - create_verb_result, - verb, -) - -from graphrag.config.models.graph_rag_config import GraphRagConfig -from graphrag.index.config.workflow import PipelineWorkflowConfig, PipelineWorkflowStep -from graphrag.index.context import PipelineRunContext -from graphrag.index.flows.create_final_text_units import ( - create_final_text_units, -) -from graphrag.index.operations.snapshot import snapshot -from graphrag.index.utils.ds_util import get_named_input_table, get_required_input_table -from graphrag.storage.pipeline_storage import PipelineStorage -from graphrag.utils.storage import load_table_from_storage - -workflow_name = "create_final_text_units" - - -def build_steps( - config: PipelineWorkflowConfig, -) -> list[PipelineWorkflowStep]: - """ - Create the final text-units table. 
- - ## Dependencies - * `workflow:create_base_text_units` - * `workflow:create_final_entities` - * `workflow:create_final_communities` - """ - covariates_enabled = config.get("covariates_enabled", False) - - input = { - "source": "workflow:create_base_text_units", - "entities": "workflow:create_final_entities", - "relationships": "workflow:create_final_relationships", - } - - if covariates_enabled: - input["covariates"] = "workflow:create_final_covariates" - - return [ - { - "verb": workflow_name, - "args": {}, - "input": input, - }, - ] - - -@verb(name=workflow_name, treats_input_tables_as_immutable=True) -async def workflow( - input: VerbInput, - runtime_storage: PipelineStorage, - **_kwargs: dict, -) -> VerbResult: - """All the steps to transform the text units.""" - text_units = await runtime_storage.get("base_text_units") - final_entities = cast( - "pd.DataFrame", get_required_input_table(input, "entities").table - ) - final_relationships = cast( - "pd.DataFrame", get_required_input_table(input, "relationships").table - ) - final_covariates = get_named_input_table(input, "covariates") - - if final_covariates: - final_covariates = cast("pd.DataFrame", final_covariates.table) - - output = create_final_text_units( - text_units, - final_entities, - final_relationships, - final_covariates, - ) - - return create_verb_result(cast("Table", output)) - - -async def run_workflow( - _config: GraphRagConfig, - context: PipelineRunContext, - _callbacks: VerbCallbacks, -) -> pd.DataFrame | None: - """All the steps to transform the text units.""" - text_units = await context.runtime_storage.get("base_text_units") - final_entities = await load_table_from_storage( - "create_final_entities.parquet", context.storage - ) - final_relationships = await load_table_from_storage( - "create_final_relationships.parquet", context.storage - ) - final_covariates = await load_table_from_storage( - "create_final_covariates.parquet", context.storage - ) - - output = create_final_text_units( - 
text_units, - final_entities, - final_relationships, - final_covariates, - ) - - await snapshot( - output, - name="create_final_text_units", - storage=context.storage, - formats=["parquet"], - ) - - return output diff --git a/graphrag/index/workflows/v1/extract_graph.py b/graphrag/index/workflows/v1/extract_graph.py deleted file mode 100644 index 7644100f22..0000000000 --- a/graphrag/index/workflows/v1/extract_graph.py +++ /dev/null @@ -1,195 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""A module containing build_steps method definition.""" - -from typing import Any, cast - -import pandas as pd -from datashaper import ( - AsyncType, - Table, - VerbCallbacks, - verb, -) -from datashaper.table_store.types import VerbResult, create_verb_result - -from graphrag.cache.pipeline_cache import PipelineCache -from graphrag.config.models.graph_rag_config import GraphRagConfig -from graphrag.index.config.workflow import PipelineWorkflowConfig, PipelineWorkflowStep -from graphrag.index.context import PipelineRunContext -from graphrag.index.flows.extract_graph import ( - extract_graph, -) -from graphrag.index.operations.create_graph import create_graph -from graphrag.index.operations.snapshot import snapshot -from graphrag.index.operations.snapshot_graphml import snapshot_graphml -from graphrag.storage.pipeline_storage import PipelineStorage - -workflow_name = "extract_graph" - - -def build_steps( - config: PipelineWorkflowConfig, -) -> list[PipelineWorkflowStep]: - """ - Create the base table for the entity graph. 
- - ## Dependencies - * `workflow:create_base_text_units` - """ - entity_extraction_config = config.get("entity_extract", {}) - async_mode = entity_extraction_config.get("async_mode", AsyncType.AsyncIO) - extraction_strategy = entity_extraction_config.get("strategy") - extraction_num_threads = entity_extraction_config.get("num_threads", 4) - entity_types = entity_extraction_config.get("entity_types") - - summarize_descriptions_config = config.get("summarize_descriptions", {}) - summarization_strategy = summarize_descriptions_config.get("strategy") - summarization_num_threads = summarize_descriptions_config.get("num_threads", 4) - - snapshot_graphml = config.get("snapshot_graphml", False) or False - snapshot_transient = config.get("snapshot_transient", False) or False - - return [ - { - "verb": workflow_name, - "args": { - "extraction_strategy": extraction_strategy, - "extraction_num_threads": extraction_num_threads, - "extraction_async_mode": async_mode, - "entity_types": entity_types, - "summarization_strategy": summarization_strategy, - "summarization_num_threads": summarization_num_threads, - "snapshot_graphml_enabled": snapshot_graphml, - "snapshot_transient_enabled": snapshot_transient, - }, - "input": ({"source": "workflow:create_base_text_units"}), - }, - ] - - -@verb( - name=workflow_name, - treats_input_tables_as_immutable=True, -) -async def workflow( - callbacks: VerbCallbacks, - cache: PipelineCache, - storage: PipelineStorage, - runtime_storage: PipelineStorage, - extraction_strategy: dict[str, Any] | None, - extraction_num_threads: int = 4, - extraction_async_mode: AsyncType = AsyncType.AsyncIO, - entity_types: list[str] | None = None, - summarization_strategy: dict[str, Any] | None = None, - summarization_num_threads: int = 4, - snapshot_graphml_enabled: bool = False, - snapshot_transient_enabled: bool = False, - **_kwargs: dict, -) -> VerbResult: - """All the steps to create the base entity graph.""" - text_units = await 
runtime_storage.get("base_text_units") - - base_entity_nodes, base_relationship_edges = await extract_graph( - text_units, - callbacks, - cache, - extraction_strategy=extraction_strategy, - extraction_num_threads=extraction_num_threads, - extraction_async_mode=extraction_async_mode, - entity_types=entity_types, - summarization_strategy=summarization_strategy, - summarization_num_threads=summarization_num_threads, - ) - - await runtime_storage.set("base_entity_nodes", base_entity_nodes) - await runtime_storage.set("base_relationship_edges", base_relationship_edges) - - if snapshot_graphml_enabled: - # todo: extract graphs at each level, and add in meta like descriptions - graph = create_graph(base_relationship_edges) - await snapshot_graphml( - graph, - name="graph", - storage=storage, - ) - - if snapshot_transient_enabled: - await snapshot( - base_entity_nodes, - name="base_entity_nodes", - storage=storage, - formats=["parquet"], - ) - await snapshot( - base_relationship_edges, - name="base_relationship_edges", - storage=storage, - formats=["parquet"], - ) - - return create_verb_result(cast("Table", pd.DataFrame())) - - -async def run_workflow( - config: GraphRagConfig, - context: PipelineRunContext, - callbacks: VerbCallbacks, -) -> pd.DataFrame | None: - """All the steps to create the base entity graph.""" - text_units = await context.runtime_storage.get("base_text_units") - - extraction_strategy = config.entity_extraction.resolved_strategy( - config.root_dir, config.encoding_model - ) - extraction_num_threads = config.entity_extraction.parallelization.num_threads - extraction_async_mode = config.entity_extraction.async_mode - entity_types = config.entity_extraction.entity_types - - summarization_strategy = config.summarize_descriptions.resolved_strategy( - config.root_dir, - ) - summarization_num_threads = ( - config.summarize_descriptions.parallelization.num_threads - ) - - base_entity_nodes, base_relationship_edges = await extract_graph( - text_units, - 
callbacks, - context.cache, - extraction_strategy=extraction_strategy, - extraction_num_threads=extraction_num_threads, - extraction_async_mode=extraction_async_mode, - entity_types=entity_types, - summarization_strategy=summarization_strategy, - summarization_num_threads=summarization_num_threads, - ) - - await context.runtime_storage.set("base_entity_nodes", base_entity_nodes) - await context.runtime_storage.set( - "base_relationship_edges", base_relationship_edges - ) - - if config.snapshots.graphml: - # todo: extract graphs at each level, and add in meta like descriptions - graph = create_graph(base_relationship_edges) - await snapshot_graphml( - graph, - name="graph", - storage=context.storage, - ) - - if config.snapshots.transient: - await snapshot( - base_entity_nodes, - name="base_entity_nodes", - storage=context.storage, - formats=["parquet"], - ) - await snapshot( - base_relationship_edges, - name="base_relationship_edges", - storage=context.storage, - formats=["parquet"], - ) diff --git a/graphrag/index/workflows/v1/generate_text_embeddings.py b/graphrag/index/workflows/v1/generate_text_embeddings.py deleted file mode 100644 index 3bb01ad145..0000000000 --- a/graphrag/index/workflows/v1/generate_text_embeddings.py +++ /dev/null @@ -1,154 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
-# Licensed under the MIT License - -"""A module containing build_steps method definition.""" - -import logging -from typing import cast - -import pandas as pd -from datashaper import ( - Table, - VerbCallbacks, - VerbInput, - VerbResult, - create_verb_result, - verb, -) - -from graphrag.cache.pipeline_cache import PipelineCache -from graphrag.config.models.graph_rag_config import GraphRagConfig -from graphrag.index.config.embeddings import get_embedded_fields, get_embedding_settings -from graphrag.index.config.workflow import PipelineWorkflowConfig, PipelineWorkflowStep -from graphrag.index.context import PipelineRunContext -from graphrag.index.flows.generate_text_embeddings import ( - generate_text_embeddings, -) -from graphrag.index.utils.ds_util import get_required_input_table -from graphrag.storage.pipeline_storage import PipelineStorage -from graphrag.utils.storage import load_table_from_storage - -log = logging.getLogger(__name__) - -workflow_name = "generate_text_embeddings" - -input = { - "source": "workflow:create_final_documents", - "relationships": "workflow:create_final_relationships", - "text_units": "workflow:create_final_text_units", - "entities": "workflow:create_final_entities", - "community_reports": "workflow:create_final_community_reports", -} - - -def build_steps( - config: PipelineWorkflowConfig, -) -> list[PipelineWorkflowStep]: - """ - Create the final embeddings files. 
- - ## Dependencies - * `workflow:create_final_documents` - * `workflow:create_final_relationships` - * `workflow:create_final_text_units` - * `workflow:create_final_entities` - * `workflow:create_final_community_reports` - """ - text_embed = config.get("text_embed", {}) - embedded_fields = config.get("embedded_fields", {}) - snapshot_embeddings = config.get("snapshot_embeddings", False) - return [ - { - "verb": workflow_name, - "args": { - "text_embed": text_embed, - "embedded_fields": embedded_fields, - "snapshot_embeddings_enabled": snapshot_embeddings, - }, - "input": input, - }, - ] - - -@verb(name=workflow_name, treats_input_tables_as_immutable=True) -async def workflow( - input: VerbInput, - callbacks: VerbCallbacks, - cache: PipelineCache, - storage: PipelineStorage, - text_embed: dict, - embedded_fields: set[str], - snapshot_embeddings_enabled: bool = False, - **_kwargs: dict, -) -> VerbResult: - """All the steps to generate embeddings.""" - source = cast("pd.DataFrame", input.get_input()) - final_relationships = cast( - "pd.DataFrame", get_required_input_table(input, "relationships").table - ) - final_text_units = cast( - "pd.DataFrame", get_required_input_table(input, "text_units").table - ) - final_entities = cast( - "pd.DataFrame", get_required_input_table(input, "entities").table - ) - - final_community_reports = cast( - "pd.DataFrame", get_required_input_table(input, "community_reports").table - ) - - await generate_text_embeddings( - final_documents=source, - final_relationships=final_relationships, - final_text_units=final_text_units, - final_entities=final_entities, - final_community_reports=final_community_reports, - callbacks=callbacks, - cache=cache, - storage=storage, - text_embed_config=text_embed, - embedded_fields=embedded_fields, - snapshot_embeddings_enabled=snapshot_embeddings_enabled, - ) - - return create_verb_result(cast("Table", pd.DataFrame())) - - -async def run_workflow( - config: GraphRagConfig, - context: PipelineRunContext, - 
callbacks: VerbCallbacks, -) -> pd.DataFrame | None: - """All the steps to transform community reports.""" - final_documents = await load_table_from_storage( - "create_final_documents.parquet", context.storage - ) - final_relationships = await load_table_from_storage( - "create_final_relationships.parquet", context.storage - ) - final_text_units = await load_table_from_storage( - "create_final_text_units.parquet", context.storage - ) - final_entities = await load_table_from_storage( - "create_final_entities.parquet", context.storage - ) - final_community_reports = await load_table_from_storage( - "create_final_community_reports.parquet", context.storage - ) - - embedded_fields = get_embedded_fields(config) - text_embed = get_embedding_settings(config.embeddings) - - await generate_text_embeddings( - final_documents=final_documents, - final_relationships=final_relationships, - final_text_units=final_text_units, - final_entities=final_entities, - final_community_reports=final_community_reports, - callbacks=callbacks, - cache=context.cache, - storage=context.storage, - text_embed_config=text_embed, - embedded_fields=embedded_fields, - snapshot_embeddings_enabled=config.snapshots.embeddings, - ) diff --git a/graphrag/logger/base.py b/graphrag/logger/base.py index b730668e87..73b5668552 100644 --- a/graphrag/logger/base.py +++ b/graphrag/logger/base.py @@ -6,7 +6,7 @@ from abc import ABC, abstractmethod from typing import Any -from datashaper.progress.types import Progress +from graphrag.logger.progress import Progress class StatusLogger(ABC): diff --git a/graphrag/logger/progress.py b/graphrag/logger/progress.py new file mode 100644 index 0000000000..536786100b --- /dev/null +++ b/graphrag/logger/progress.py @@ -0,0 +1,82 @@ +# Copyright (c) 2024 Microsoft Corporation. 
+# Licensed under the MIT License + +"""Progress reporting types.""" + +from collections.abc import Callable, Iterable +from dataclasses import dataclass +from typing import TypeVar + +T = TypeVar("T") + + +@dataclass +class Progress: + """A class representing the progress of a task.""" + + percent: float | None = None + """0 - 1 progress""" + + description: str | None = None + """Description of the progress""" + + total_items: int | None = None + """Total number of items""" + + completed_items: int | None = None + """Number of items completed""" "" + + +ProgressHandler = Callable[[Progress], None] +"""A function to handle progress reports.""" + + +class ProgressTicker: + """A class that emits progress reports incrementally.""" + + _callback: ProgressHandler | None + _num_total: int + _num_complete: int + + def __init__(self, callback: ProgressHandler | None, num_total: int): + self._callback = callback + self._num_total = num_total + self._num_complete = 0 + + def __call__(self, num_ticks: int = 1) -> None: + """Emit progress.""" + self._num_complete += num_ticks + if self._callback is not None: + self._callback( + Progress( + total_items=self._num_total, completed_items=self._num_complete + ) + ) + + def done(self) -> None: + """Mark the progress as done.""" + if self._callback is not None: + self._callback( + Progress(total_items=self._num_total, completed_items=self._num_total) + ) + + +def progress_ticker(callback: ProgressHandler | None, num_total: int) -> ProgressTicker: + """Create a progress ticker.""" + return ProgressTicker(callback, num_total) + + +def progress_iterable( + iterable: Iterable[T], + progress: ProgressHandler | None, + num_total: int | None = None, +) -> Iterable[T]: + """Wrap an iterable with a progress handler. 
Every time an item is yielded, the progress handler will be called with the current progress.""" + if num_total is None: + num_total = len(list(iterable)) + + tick = ProgressTicker(progress, num_total) + + for item in iterable: + tick(1) + yield item diff --git a/graphrag/logger/rich_progress.py b/graphrag/logger/rich_progress.py index 22145d12df..818697a4f2 100644 --- a/graphrag/logger/rich_progress.py +++ b/graphrag/logger/rich_progress.py @@ -6,7 +6,6 @@ # Print iterations progress import asyncio -from datashaper import Progress as DSProgress from rich.console import Console, Group from rich.live import Live from rich.progress import Progress, TaskID, TimeElapsedColumn @@ -14,6 +13,7 @@ from rich.tree import Tree from graphrag.logger.base import ProgressLogger +from graphrag.logger.progress import Progress as GRProgress # https://stackoverflow.com/a/34325723 @@ -138,7 +138,7 @@ def info(self, message: str) -> None: """Log information.""" self._console.print(message) - def __call__(self, progress_update: DSProgress) -> None: + def __call__(self, progress_update: GRProgress) -> None: """Update progress.""" if self._disposing: return diff --git a/graphrag/prompt_tune/loader/input.py b/graphrag/prompt_tune/loader/input.py index e2b6c49b56..db8a95804a 100644 --- a/graphrag/prompt_tune/loader/input.py +++ b/graphrag/prompt_tune/loader/input.py @@ -5,11 +5,11 @@ import numpy as np import pandas as pd -from datashaper import NoopVerbCallbacks from fnllm import ChatLLM from pydantic import TypeAdapter import graphrag.config.defaults as defs +from graphrag.callbacks.noop_verb_callbacks import NoopVerbCallbacks from graphrag.config.models.graph_rag_config import GraphRagConfig from graphrag.config.models.llm_parameters import LLMParameters from graphrag.index.input.factory import create_input diff --git a/graphrag/prompts/index/claim_extraction.py b/graphrag/prompts/index/claim_extraction.py index 05b3153c20..6ce3f0c2cc 100644 --- 
a/graphrag/prompts/index/claim_extraction.py +++ b/graphrag/prompts/index/claim_extraction.py @@ -58,4 +58,4 @@ CONTINUE_PROMPT = "MANY entities were missed in the last extraction. Add them below using the same format:\n" -LOOP_PROMPT = "It appears some entities may have still been missed. Answer YES {tuple_delimiter} NO if there are still entities that need to be added.\n" +LOOP_PROMPT = "It appears some entities may have still been missed. Answer Y or N if there are still entities that need to be added.\n" diff --git a/graphrag/prompts/index/entity_extraction.py b/graphrag/prompts/index/entity_extraction.py index cb1bcc668a..b1aaea3d3f 100644 --- a/graphrag/prompts/index/entity_extraction.py +++ b/graphrag/prompts/index/entity_extraction.py @@ -126,4 +126,4 @@ Output:""" CONTINUE_PROMPT = "MANY entities and relationships were missed in the last extraction. Remember to ONLY emit entities that match any of the previously extracted types. Add them below using the same format:\n" -LOOP_PROMPT = "It appears some entities and relationships may have still been missed. Answer YES | NO if there are still entities or relationships that need to be added.\n" +LOOP_PROMPT = "It appears some entities and relationships may have still been missed. Answer Y or N if there are still entities or relationships that need to be added.\n" diff --git a/graphrag/prompts/query/basic_search_system_prompt.py b/graphrag/prompts/query/basic_search_system_prompt.py new file mode 100644 index 0000000000..f98ea0582c --- /dev/null +++ b/graphrag/prompts/query/basic_search_system_prompt.py @@ -0,0 +1,68 @@ +# Copyright (c) 2024 Microsoft Corporation. +# Licensed under the MIT License + +"""Basic Search prompts.""" + +BASIC_SEARCH_SYSTEM_PROMPT = """ +---Role--- + +You are a helpful assistant responding to questions about data in the tables provided. 
+ + +---Goal--- + +Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge. + +If you don't know the answer, just say so. Do not make anything up. + +Points supported by data should list their data references as follows: + +"This is an example sentence supported by multiple text references [Data: Sources (record ids)]." + +Do not list more than 5 record ids in a single reference. Instead, list the top 5 most relevant record ids and add "+more" to indicate that there are more. + +For example: + +"Person X is the owner of Company Y and subject to many allegations of wrongdoing [Data: Sources (15, 16)]." + +where 15 and 16 represent the id (not the index) of the relevant data record. + +Do not include information where the supporting text for it is not provided. + + +---Target response length and format--- + +{response_type} + + +---Data tables--- + +{context_data} + + +---Goal--- + +Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge. + +If you don't know the answer, just say so. Do not make anything up. + +Points supported by data should list their data references as follows: + +"This is an example sentence supported by multiple text references [Data: Sources (record ids)]." + +Do not list more than 5 record ids in a single reference. Instead, list the top 5 most relevant record ids and add "+more" to indicate that there are more. + +For example: + +"Person X is the owner of Company Y and subject to many allegations of wrongdoing [Data: Sources (15, 16)]." + +where 15 and 16 represent the id (not the index) of the relevant data record. 
+ +Do not include information where the supporting text for it is not provided. + +---Target response length and format--- + +{response_type} + +Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown. +""" diff --git a/graphrag/query/context_builder/builders.py b/graphrag/query/context_builder/builders.py index 79d2164e21..2ae7b8cf70 100644 --- a/graphrag/query/context_builder/builders.py +++ b/graphrag/query/context_builder/builders.py @@ -60,3 +60,16 @@ def build_context( **kwargs, ) -> tuple[pd.DataFrame, dict[str, int]]: """Build the context for the primer search actions.""" + + +class BasicContextBuilder(ABC): + """Base class for basic-search context builders.""" + + @abstractmethod + def build_context( + self, + query: str, + conversation_history: ConversationHistory | None = None, + **kwargs, + ) -> ContextBuilderResult: + """Build the context for the basic search mode.""" diff --git a/graphrag/query/factory.py b/graphrag/query/factory.py index 5043b6db9e..8435d9cf96 100644 --- a/graphrag/query/factory.py +++ b/graphrag/query/factory.py @@ -14,6 +14,10 @@ from graphrag.model.text_unit import TextUnit from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey from graphrag.query.llm.get_client import get_llm, get_text_embedder +from graphrag.query.structured_search.basic_search.basic_context import ( + BasicSearchContext, +) +from graphrag.query.structured_search.basic_search.search import BasicSearch from graphrag.query.structured_search.drift_search.drift_context import ( DRIFTSearchContextBuilder, ) @@ -191,3 +195,43 @@ def get_drift_search_engine( ), token_encoder=token_encoder, ) + + +def get_basic_search_engine( + text_units: list[TextUnit], + text_unit_embeddings: BaseVectorStore, + config: GraphRagConfig, + system_prompt: str | None = None, +) -> BasicSearch: + """Create a basic search engine based on data + configuration.""" + llm = get_llm(config) + text_embedder = 
get_text_embedder(config) + token_encoder = tiktoken.get_encoding(config.encoding_model) + + ls_config = config.basic_search + + return BasicSearch( + llm=llm, + system_prompt=system_prompt, + context_builder=BasicSearchContext( + text_embedder=text_embedder, + text_unit_embeddings=text_unit_embeddings, + text_units=text_units, + token_encoder=token_encoder, + ), + token_encoder=token_encoder, + llm_params={ + "max_tokens": ls_config.llm_max_tokens, # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 1000-1500) + "temperature": ls_config.temperature, + "top_p": ls_config.top_p, + "n": ls_config.n, + }, + context_builder_params={ + "text_unit_prop": ls_config.text_unit_prop, + "conversation_history_max_turns": ls_config.conversation_history_max_turns, + "conversation_history_user_turns_only": True, + "return_candidate_context": False, + "embedding_vectorstore_key": "id", + "max_tokens": ls_config.max_tokens, # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000) + }, + ) diff --git a/graphrag/query/structured_search/base.py b/graphrag/query/structured_search/base.py index 2657462347..749e89a07d 100644 --- a/graphrag/query/structured_search/base.py +++ b/graphrag/query/structured_search/base.py @@ -12,6 +12,7 @@ import tiktoken from graphrag.query.context_builder.builders import ( + BasicContextBuilder, DRIFTContextBuilder, GlobalContextBuilder, LocalContextBuilder, @@ -41,7 +42,13 @@ class SearchResult: output_tokens_categories: dict[str, int] | None = None -T = TypeVar("T", GlobalContextBuilder, LocalContextBuilder, DRIFTContextBuilder) +T = TypeVar( + "T", + GlobalContextBuilder, + LocalContextBuilder, + DRIFTContextBuilder, + BasicContextBuilder, +) class BaseSearch(ABC, Generic[T]): diff --git a/examples/single_verb/__init__.py b/graphrag/query/structured_search/basic_search/__init__.py similarity index 70% 
rename from examples/single_verb/__init__.py rename to graphrag/query/structured_search/basic_search/__init__.py index 0a3e38adfb..804a5d20d3 100644 --- a/examples/single_verb/__init__.py +++ b/graphrag/query/structured_search/basic_search/__init__.py @@ -1,2 +1,4 @@ # Copyright (c) 2024 Microsoft Corporation. # Licensed under the MIT License + +"""The BasicSearch package.""" diff --git a/graphrag/query/structured_search/basic_search/basic_context.py b/graphrag/query/structured_search/basic_search/basic_context.py new file mode 100644 index 0000000000..c8ae1ef0de --- /dev/null +++ b/graphrag/query/structured_search/basic_search/basic_context.py @@ -0,0 +1,61 @@ +# Copyright (c) 2024 Microsoft Corporation. +# Licensed under the MIT License + +"""Basic Context Builder implementation.""" + +import pandas as pd +import tiktoken + +from graphrag.model.text_unit import TextUnit +from graphrag.query.context_builder.builders import ( + BasicContextBuilder, + ContextBuilderResult, +) +from graphrag.query.context_builder.conversation_history import ConversationHistory +from graphrag.query.llm.base import BaseTextEmbedding +from graphrag.vector_stores.base import BaseVectorStore + + +class BasicSearchContext(BasicContextBuilder): + """Class representing the Basic Search Context Builder.""" + + def __init__( + self, + text_embedder: BaseTextEmbedding, + text_unit_embeddings: BaseVectorStore, + text_units: list[TextUnit] | None = None, + token_encoder: tiktoken.Encoding | None = None, + embedding_vectorstore_key: str = "id", + ): + self.text_embedder = text_embedder + self.token_encoder = token_encoder + self.text_units = text_units + self.text_unit_embeddings = text_unit_embeddings + self.embedding_vectorstore_key = embedding_vectorstore_key + + def build_context( + self, + query: str, + conversation_history: ConversationHistory | None = None, + **kwargs, + ) -> ContextBuilderResult: + """Build the context for the basic search mode.""" + search_results = 
self.text_unit_embeddings.similarity_search_by_text( + text=query, + text_embedder=lambda t: self.text_embedder.embed(t), + k=kwargs.get("k", 10), + ) + # we don't have a friendly id on text_units, so just copy the index + sources = [ + {"id": str(search_results.index(r)), "text": r.document.text} + for r in search_results + ] + # make a delimited table for the context; this imitates graphrag context building + table = ["id|text"] + [f"{s['id']}|{s['text']}" for s in sources] + + columns = pd.Index(["id", "text"]) + + return ContextBuilderResult( + context_chunks="\n\n".join(table), + context_records={"sources": pd.DataFrame(sources, columns=columns)}, + ) diff --git a/graphrag/query/structured_search/basic_search/search.py b/graphrag/query/structured_search/basic_search/search.py new file mode 100644 index 0000000000..b97213c9f9 --- /dev/null +++ b/graphrag/query/structured_search/basic_search/search.py @@ -0,0 +1,219 @@ +# Copyright (c) 2024 Microsoft Corporation. +# Licensed under the MIT License + +"""BasicSearch implementation.""" + +import logging +import time +from collections.abc import AsyncGenerator +from typing import Any + +import tiktoken + +from graphrag.prompts.query.basic_search_system_prompt import ( + BASIC_SEARCH_SYSTEM_PROMPT, +) +from graphrag.query.context_builder.builders import BasicContextBuilder +from graphrag.query.context_builder.conversation_history import ConversationHistory +from graphrag.query.llm.base import BaseLLM, BaseLLMCallback +from graphrag.query.llm.text_utils import num_tokens +from graphrag.query.structured_search.base import BaseSearch, SearchResult + +DEFAULT_LLM_PARAMS = { + "max_tokens": 1500, + "temperature": 0.0, +} + +log = logging.getLogger(__name__) +""" +Implementation of a generic RAG algorithm (vector search on raw text chunks) +""" + + +class BasicSearch(BaseSearch[BasicContextBuilder]): + """Search orchestration for basic search mode.""" + + def __init__( + self, + llm: BaseLLM, + context_builder: 
BasicContextBuilder, + token_encoder: tiktoken.Encoding | None = None, + system_prompt: str | None = None, + response_type: str = "multiple paragraphs", + callbacks: list[BaseLLMCallback] | None = None, + llm_params: dict[str, Any] = DEFAULT_LLM_PARAMS, + context_builder_params: dict | None = None, + ): + super().__init__( + llm=llm, + context_builder=context_builder, + token_encoder=token_encoder, + llm_params=llm_params, + context_builder_params=context_builder_params or {}, + ) + self.system_prompt = system_prompt or BASIC_SEARCH_SYSTEM_PROMPT + self.callbacks = callbacks + self.response_type = response_type + + async def asearch( + self, + query: str, + conversation_history: ConversationHistory | None = None, + **kwargs, + ) -> SearchResult: + """Build basic search context that fits a single context window and generate answer for the user query.""" + start_time = time.time() + search_prompt = "" + llm_calls, prompt_tokens, output_tokens = {}, {}, {} + + context_result = self.context_builder.build_context( + query=query, + conversation_history=conversation_history, + **kwargs, + **self.context_builder_params, + ) + + llm_calls["build_context"] = context_result.llm_calls + prompt_tokens["build_context"] = context_result.prompt_tokens + output_tokens["build_context"] = context_result.output_tokens + + log.info("GENERATE ANSWER: %s. 
QUERY: %s", start_time, query) + try: + search_prompt = self.system_prompt.format( + context_data=context_result.context_chunks, + response_type=self.response_type, + ) + search_messages = [ + {"role": "system", "content": search_prompt}, + {"role": "user", "content": query}, + ] + + response = await self.llm.agenerate( + messages=search_messages, + streaming=True, + callbacks=self.callbacks, + **self.llm_params, + ) + + llm_calls["response"] = 1 + prompt_tokens["response"] = num_tokens(search_prompt, self.token_encoder) + output_tokens["response"] = num_tokens(response, self.token_encoder) + + return SearchResult( + response=response, + context_data=context_result.context_records, + context_text=context_result.context_chunks, + completion_time=time.time() - start_time, + llm_calls=1, + prompt_tokens=num_tokens(search_prompt, self.token_encoder), + output_tokens=sum(output_tokens.values()), + ) + + except Exception: + log.exception("Exception in _asearch") + return SearchResult( + response="", + context_data=context_result.context_records, + context_text=context_result.context_chunks, + completion_time=time.time() - start_time, + llm_calls=1, + prompt_tokens=num_tokens(search_prompt, self.token_encoder), + output_tokens=0, + ) + + def search( + self, + query: str, + conversation_history: ConversationHistory | None = None, + **kwargs, + ) -> SearchResult: + """Build basic search context that fits a single context window and generate answer for the user question.""" + start_time = time.time() + search_prompt = "" + llm_calls, prompt_tokens, output_tokens = {}, {}, {} + context_result = self.context_builder.build_context( + query=query, + conversation_history=conversation_history, + **kwargs, + **self.context_builder_params, + ) + llm_calls["build_context"] = context_result.llm_calls + prompt_tokens["build_context"] = context_result.prompt_tokens + output_tokens["build_context"] = context_result.output_tokens + + log.info("GENERATE ANSWER: %d. 
QUERY: %s", start_time, query) + try: + search_prompt = self.system_prompt.format( + context_data=context_result.context_chunks, + response_type=self.response_type, + ) + search_messages = [ + {"role": "system", "content": search_prompt}, + {"role": "user", "content": query}, + ] + + response = self.llm.generate( + messages=search_messages, + streaming=True, + callbacks=self.callbacks, + **self.llm_params, + ) + llm_calls["response"] = 1 + prompt_tokens["response"] = num_tokens(search_prompt, self.token_encoder) + output_tokens["response"] = num_tokens(response, self.token_encoder) + + return SearchResult( + response=response, + context_data=context_result.context_records, + context_text=context_result.context_chunks, + completion_time=time.time() - start_time, + llm_calls=sum(llm_calls.values()), + prompt_tokens=sum(prompt_tokens.values()), + output_tokens=sum(output_tokens.values()), + llm_calls_categories=llm_calls, + prompt_tokens_categories=prompt_tokens, + output_tokens_categories=output_tokens, + ) + + except Exception: + log.exception("Exception in _map_response_single_batch") + return SearchResult( + response="", + context_data=context_result.context_records, + context_text=context_result.context_chunks, + completion_time=time.time() - start_time, + llm_calls=1, + prompt_tokens=num_tokens(search_prompt, self.token_encoder), + output_tokens=0, + ) + + async def astream_search( + self, + query: str, + conversation_history: ConversationHistory | None = None, + ) -> AsyncGenerator: + """Build basic search context that fits a single context window and generate answer for the user query.""" + start_time = time.time() + + context_result = self.context_builder.build_context( + query=query, + conversation_history=conversation_history, + **self.context_builder_params, + ) + log.info("GENERATE ANSWER: %s. 
QUERY: %s", start_time, query) + search_prompt = self.system_prompt.format( + context_data=context_result.context_chunks, response_type=self.response_type + ) + search_messages = [ + {"role": "system", "content": search_prompt}, + {"role": "user", "content": query}, + ] + + # send context records first before sending the reduce response + yield context_result.context_records + async for response in self.llm.astream_generate( # type: ignore + messages=search_messages, + callbacks=self.callbacks, + **self.llm_params, + ): + yield response diff --git a/graphrag/storage/blob_pipeline_storage.py b/graphrag/storage/blob_pipeline_storage.py index 701e59e25c..f72663052c 100644 --- a/graphrag/storage/blob_pipeline_storage.py +++ b/graphrag/storage/blob_pipeline_storage.py @@ -11,9 +11,9 @@ from azure.identity import DefaultAzureCredential from azure.storage.blob import BlobServiceClient -from datashaper import Progress from graphrag.logger.base import ProgressLogger +from graphrag.logger.progress import Progress from graphrag.storage.pipeline_storage import PipelineStorage log = logging.getLogger(__name__) diff --git a/graphrag/storage/cosmosdb_pipeline_storage.py b/graphrag/storage/cosmosdb_pipeline_storage.py index 9de9cf6dc0..c832ebc8bd 100644 --- a/graphrag/storage/cosmosdb_pipeline_storage.py +++ b/graphrag/storage/cosmosdb_pipeline_storage.py @@ -15,9 +15,9 @@ from azure.cosmos.exceptions import CosmosResourceNotFoundError from azure.cosmos.partition_key import PartitionKey from azure.identity import DefaultAzureCredential -from datashaper import Progress from graphrag.logger.base import ProgressLogger +from graphrag.logger.progress import Progress from graphrag.storage.pipeline_storage import PipelineStorage log = logging.getLogger(__name__) diff --git a/graphrag/storage/file_pipeline_storage.py b/graphrag/storage/file_pipeline_storage.py index f64df723b4..a2d45b89b3 100644 --- a/graphrag/storage/file_pipeline_storage.py +++ 
b/graphrag/storage/file_pipeline_storage.py @@ -14,9 +14,9 @@ import aiofiles from aiofiles.os import remove from aiofiles.ospath import exists -from datashaper import Progress from graphrag.logger.base import ProgressLogger +from graphrag.logger.progress import Progress from graphrag.storage.pipeline_storage import PipelineStorage log = logging.getLogger(__name__) diff --git a/graphrag/utils/storage.py b/graphrag/utils/storage.py index a28b0c4c1c..caf8003fc5 100644 --- a/graphrag/utils/storage.py +++ b/graphrag/utils/storage.py @@ -15,14 +15,15 @@ async def load_table_from_storage(name: str, storage: PipelineStorage) -> pd.DataFrame: """Load a parquet from the storage instance.""" - if not await storage.has(name): - msg = f"Could not find {name} in storage!" + filename = f"{name}.parquet" + if not await storage.has(filename): + msg = f"Could not find {filename} in storage!" raise ValueError(msg) try: - log.info("reading table from storage: %s", name) - return pd.read_parquet(BytesIO(await storage.get(name, as_bytes=True))) + log.info("reading table from storage: %s", filename) + return pd.read_parquet(BytesIO(await storage.get(filename, as_bytes=True))) except Exception: - log.exception("error loading table from storage: %s", name) + log.exception("error loading table from storage: %s", filename) raise @@ -30,4 +31,14 @@ async def write_table_to_storage( table: pd.DataFrame, name: str, storage: PipelineStorage ) -> None: """Write a table to storage.""" - await storage.set(name, table.to_parquet()) + await storage.set(f"{name}.parquet", table.to_parquet()) + + +async def delete_table_from_storage(name: str, storage: PipelineStorage) -> None: + """Delete a table from storage.""" + await storage.delete(f"{name}.parquet") + + +async def storage_has_table(name: str, storage: PipelineStorage) -> bool: + """Check if a table exists in storage.""" + return await storage.has(f"{name}.parquet") diff --git a/poetry.lock b/poetry.lock index dc21b3802e..cef2d2812f 100644 --- 
a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand. [[package]] name = "aiofiles" @@ -800,7 +800,6 @@ files = [ {file = "cryptography-44.0.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:761817a3377ef15ac23cd7834715081791d4ec77f9297ee694ca1ee9c2c7e5eb"}, {file = "cryptography-44.0.0-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:3c672a53c0fb4725a29c303be906d3c1fa99c32f58abe008a82705f9ee96f40b"}, {file = "cryptography-44.0.0-cp37-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:4ac4c9f37eba52cb6fbeaf5b59c152ea976726b865bd4cf87883a7e7006cc543"}, - {file = "cryptography-44.0.0-cp37-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:60eb32934076fa07e4316b7b2742fa52cbb190b42c2df2863dbc4230a0a9b385"}, {file = "cryptography-44.0.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ed3534eb1090483c96178fcb0f8893719d96d5274dfde98aa6add34614e97c8e"}, {file = "cryptography-44.0.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:f3f6fdfa89ee2d9d496e2c087cebef9d4fcbb0ad63c40e821b39f74bf48d9c5e"}, {file = "cryptography-44.0.0-cp37-abi3-win32.whl", hash = "sha256:eb33480f1bad5b78233b0ad3e1b0be21e8ef1da745d8d2aecbb20671658b9053"}, @@ -811,7 +810,6 @@ files = [ {file = "cryptography-44.0.0-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:c5eb858beed7835e5ad1faba59e865109f3e52b3783b9ac21e7e47dc5554e289"}, {file = "cryptography-44.0.0-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:f53c2c87e0fb4b0c00fa9571082a057e37690a8f12233306161c8f4b819960b7"}, {file = "cryptography-44.0.0-cp39-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:9e6fc8a08e116fb7c7dd1f040074c9d7b51d74a8ea40d4df2fc7aa08b76b9e6c"}, - {file = "cryptography-44.0.0-cp39-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:9abcc2e083cbe8dde89124a47e5e53ec38751f0d7dfd36801008f316a127d7ba"}, {file = 
"cryptography-44.0.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:d2436114e46b36d00f8b72ff57e598978b37399d2786fd39793c36c6d5cb1c64"}, {file = "cryptography-44.0.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a01956ddfa0a6790d594f5b34fc1bfa6098aca434696a03cfdbe469b8ed79285"}, {file = "cryptography-44.0.0-cp39-abi3-win32.whl", hash = "sha256:eca27345e1214d1b9f9490d200f9db5a874479be914199194e746c893788d417"}, @@ -853,23 +851,6 @@ files = [ docs = ["ipython", "matplotlib", "numpydoc", "sphinx"] tests = ["pytest", "pytest-cov", "pytest-xdist"] -[[package]] -name = "datashaper" -version = "0.0.49" -description = "This project provides a collection of utilities for doing lightweight data wrangling." -optional = false -python-versions = ">=3.10,<4" -files = [ - {file = "datashaper-0.0.49-py3-none-any.whl", hash = "sha256:7f58cabacc834765595c6e04cfbbd05be6af71907e46ebc7a91d2a4add7c2643"}, - {file = "datashaper-0.0.49.tar.gz", hash = "sha256:05bfba5964474a62bdd5259ec3fa0173d01e365208b6a4aff4ea0e63096a7533"}, -] - -[package.dependencies] -diskcache = ">=5.6.3,<6.0.0" -jsonschema = ">=4.21.1,<5.0.0" -pandas = ">=2.2.0,<3.0.0" -pyarrow = ">=15.0.0,<16.0.0" - [[package]] name = "debugpy" version = "1.8.11" @@ -943,23 +924,27 @@ packaging = "*" [[package]] name = "deptry" -version = "0.21.1" +version = "0.21.2" description = "A command line utility to check for unused, missing and transitive dependencies in a Python project." 
optional = false python-versions = ">=3.9" files = [ - {file = "deptry-0.21.1-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:c31e1a66502e28870e1e0a679598462a6119f4bcb656786e63cb545328170a3f"}, - {file = "deptry-0.21.1-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:4b53089c22d18076935a3e9e6325566fa712cd9b89fe602978a8e85f0f4209bf"}, - {file = "deptry-0.21.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b5eae7afbcb9b7f6baa855b323e0da016a23f2a98d4b181dcfd2c71766512387"}, - {file = "deptry-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4afef1c5eb0b48ebc31de2437b460df0363cb99722252b7faf7fa6f43e10cbcd"}, - {file = "deptry-0.21.1-cp39-abi3-win_amd64.whl", hash = "sha256:981a28e1feeaad82f07a6e3c8d7842c5f6eae3807dc13b24d453a20cd0a42a72"}, - {file = "deptry-0.21.1-cp39-abi3-win_arm64.whl", hash = "sha256:98075550540c6b45f57abdfc453900bd2a179dc495d986ccc0757a813ee55103"}, - {file = "deptry-0.21.1-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:79593d7631cdbbc39d76503e3af80e46d8b4873e915b85c1567a04c81e8a17d5"}, - {file = "deptry-0.21.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:145a172ea608bb86dd93a9d14f7d45ed8649a36d7f685ea725e0348cbf562f10"}, - {file = "deptry-0.21.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e487f520d4fbee513f4767ab98334a29d5d932f78eb413b64e27c977f2bf2756"}, - {file = "deptry-0.21.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:091288cad2bd6029995d2e700e965cd574079365807f202ee232e4be0a571f43"}, - {file = "deptry-0.21.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:1adf29a5aa1d33d9e1140b9235b212d9753278604b4389b2186f638692e29876"}, - {file = "deptry-0.21.1.tar.gz", hash = "sha256:60332b8d58d6584b340511a4e1b694048499f273d69eaea413631b2e8bc186ff"}, + {file = "deptry-0.21.2-cp39-abi3-macosx_10_12_x86_64.whl", hash = 
"sha256:e3b9e0c5ee437240b65e61107b5777a12064f78f604bf9f181a96c9b56eb896d"}, + {file = "deptry-0.21.2-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:d76bbf48bd62ecc44ca3d414769bd4b7956598d23d9ccb42fd359b831a31cab2"}, + {file = "deptry-0.21.2-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3080bb88c16ebd35f59cba7688416115b7aaf4630dc5a051dff2649cbf129a1b"}, + {file = "deptry-0.21.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:adb12d6678fb5dbd320a0a2e37881059d0a45bec6329df4250c977d803fe7f96"}, + {file = "deptry-0.21.2-cp39-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:7479d3079be69c3bbf5913d8e21090749c1139ee91f81520ffce90b5322476b0"}, + {file = "deptry-0.21.2-cp39-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:019167b35301edd2bdd4719c8b8f44769be4507cb8a1cd46fff4393cdbe8d31b"}, + {file = "deptry-0.21.2-cp39-abi3-win_amd64.whl", hash = "sha256:d8add495f0dd19a38aa6d1e09b14b1441bca47c9d945bc7b322efb084313eea3"}, + {file = "deptry-0.21.2-cp39-abi3-win_arm64.whl", hash = "sha256:06d48e9fa460aad02f9e1b079d9f5a69d622d291b3a0525b722fc91c88032042"}, + {file = "deptry-0.21.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:3ef8aed33a2eac357f9565063bc1257bcefa03a37038299c08a4222e28f3cd34"}, + {file = "deptry-0.21.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:917745db5f8295eb5048e43d9073a9a675ffdba865e9b294d2e7aa455730cb06"}, + {file = "deptry-0.21.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:186ddbc69c1f70e684e83e202795e1054d0c2dfc03b8acc077f65dc3b6a7f4ce"}, + {file = "deptry-0.21.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3686e86ad7063b5a6e5253454f9d9e4a7a6b1511a99bd4306fda5424480be48"}, + {file = "deptry-0.21.2-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:1012a88500f242489066f811f6ec0c93328d9340bbf0f87f0c7d2146054d197e"}, + {file = "deptry-0.21.2-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", 
hash = "sha256:769bb658172586d1b03046bdc6b6c94f6a98ecfbac04ff7f77ec61768c75e1c2"}, + {file = "deptry-0.21.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:fb2f43747b58abeec01dc277ef22859342f3bca2ac677818c94940a009b436c0"}, + {file = "deptry-0.21.2.tar.gz", hash = "sha256:4e870553c7a1fafcd99a83ba4137259525679eecabeff61bc669741efa201541"}, ] [package.dependencies] @@ -985,17 +970,6 @@ asttokens = ">=2.0.0,<3.0.0" executing = ">=1.1.1" pygments = ">=2.15.0" -[[package]] -name = "diskcache" -version = "5.6.3" -description = "Disk Cache -- Disk and file backed persistent cache." -optional = false -python-versions = ">=3" -files = [ - {file = "diskcache-5.6.3-py3-none-any.whl", hash = "sha256:5e31b2d5fbad117cc363ebaf6b689474db18a1f6438bc82358b024abd4c2ca19"}, - {file = "diskcache-5.6.3.tar.gz", hash = "sha256:2c3a3fa2743d8535d832ec61c2054a1641f41775aa7c556758a109941e33e4fc"}, -] - [[package]] name = "distro" version = "1.9.0" @@ -1424,13 +1398,13 @@ test = ["flaky", "ipyparallel", "pre-commit", "pytest (>=7.0)", "pytest-asyncio [[package]] name = "ipython" -version = "8.30.0" +version = "8.31.0" description = "IPython: Productive Interactive Computing" optional = false python-versions = ">=3.10" files = [ - {file = "ipython-8.30.0-py3-none-any.whl", hash = "sha256:85ec56a7e20f6c38fce7727dcca699ae4ffc85985aa7b23635a8008f918ae321"}, - {file = "ipython-8.30.0.tar.gz", hash = "sha256:cb0a405a306d2995a5cbb9901894d240784a9f341394c6ba3f4fe8c6eb89ff6e"}, + {file = "ipython-8.31.0-py3-none-any.whl", hash = "sha256:46ec58f8d3d076a61d128fe517a51eb730e3aaf0c184ea8c17d16e366660c6a6"}, + {file = "ipython-8.31.0.tar.gz", hash = "sha256:b6a2274606bec6166405ff05e54932ed6e5cfecaca1fc05f2cacde7bb074d70b"}, ] [package.dependencies] @@ -1804,13 +1778,13 @@ test = ["ipykernel", "pre-commit", "pytest (<8)", "pytest-cov", "pytest-timeout" [[package]] name = "jupyter-events" -version = "0.10.0" +version = "0.11.0" description = "Jupyter Event System library" optional = false -python-versions 
= ">=3.8" +python-versions = ">=3.9" files = [ - {file = "jupyter_events-0.10.0-py3-none-any.whl", hash = "sha256:4b72130875e59d57716d327ea70d3ebc3af1944d3717e5a498b8a06c6c159960"}, - {file = "jupyter_events-0.10.0.tar.gz", hash = "sha256:670b8229d3cc882ec782144ed22e0d29e1c2d639263f92ca8383e66682845e22"}, + {file = "jupyter_events-0.11.0-py3-none-any.whl", hash = "sha256:36399b41ce1ca45fe8b8271067d6a140ffa54cec4028e95491c93b78a855cacf"}, + {file = "jupyter_events-0.11.0.tar.gz", hash = "sha256:c0bc56a37aac29c1fbc3bcfbddb8c8c49533f9cf11f1c4e6adadba936574ab90"}, ] [package.dependencies] @@ -1824,7 +1798,7 @@ traitlets = ">=5.3" [package.extras] cli = ["click", "rich"] -docs = ["jupyterlite-sphinx", "myst-parser", "pydata-sphinx-theme", "sphinxcontrib-spelling"] +docs = ["jupyterlite-sphinx", "myst-parser", "pydata-sphinx-theme (>=0.16)", "sphinx (>=8)", "sphinxcontrib-spelling"] test = ["click", "pre-commit", "pytest (>=7.0)", "pytest-asyncio (>=0.19.0)", "pytest-console-scripts", "rich"] [[package]] @@ -1843,13 +1817,13 @@ jupyter-server = ">=1.1.2" [[package]] name = "jupyter-server" -version = "2.14.2" +version = "2.15.0" description = "The backend—i.e. core services, APIs, and REST endpoints—to Jupyter web applications." 
optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" files = [ - {file = "jupyter_server-2.14.2-py3-none-any.whl", hash = "sha256:47ff506127c2f7851a17bf4713434208fc490955d0e8632e95014a9a9afbeefd"}, - {file = "jupyter_server-2.14.2.tar.gz", hash = "sha256:66095021aa9638ced276c248b1d81862e4c50f292d575920bbe960de1c56b12b"}, + {file = "jupyter_server-2.15.0-py3-none-any.whl", hash = "sha256:872d989becf83517012ee669f09604aa4a28097c0bd90b2f424310156c2cdae3"}, + {file = "jupyter_server-2.15.0.tar.gz", hash = "sha256:9d446b8697b4f7337a1b7cdcac40778babdd93ba614b6d68ab1c0c918f1c4084"}, ] [package.dependencies] @@ -1858,7 +1832,7 @@ argon2-cffi = ">=21.1" jinja2 = ">=3.0.3" jupyter-client = ">=7.4.4" jupyter-core = ">=4.12,<5.0.dev0 || >=5.1.dev0" -jupyter-events = ">=0.9.0" +jupyter-events = ">=0.11.0" jupyter-server-terminals = ">=0.4.4" nbconvert = ">=6.4.4" nbformat = ">=5.3.0" @@ -1898,13 +1872,13 @@ test = ["jupyter-server (>=2.0.0)", "pytest (>=7.0)", "pytest-jupyter[server] (> [[package]] name = "jupyterlab" -version = "4.3.3" +version = "4.3.4" description = "JupyterLab computational environment" optional = false python-versions = ">=3.8" files = [ - {file = "jupyterlab-4.3.3-py3-none-any.whl", hash = "sha256:32a8fd30677e734ffcc3916a4758b9dab21b02015b668c60eb36f84357b7d4b1"}, - {file = "jupyterlab-4.3.3.tar.gz", hash = "sha256:76fa39e548fdac94dc1204af5956c556f54c785f70ee26aa47ea08eda4d5bbcd"}, + {file = "jupyterlab-4.3.4-py3-none-any.whl", hash = "sha256:b754c2601c5be6adf87cb5a1d8495d653ffb945f021939f77776acaa94dae952"}, + {file = "jupyterlab-4.3.4.tar.gz", hash = "sha256:f0bb9b09a04766e3423cccc2fc23169aa2ffedcdf8713e9e0fb33cac0b6859d0"}, ] [package.dependencies] @@ -1979,13 +1953,13 @@ files = [ [[package]] name = "jupytext" -version = "1.16.5" +version = "1.16.6" description = "Jupyter notebooks as Markdown documents, Julia, Python or R scripts" optional = false python-versions = ">=3.8" files = [ - {file = "jupytext-1.16.5-py3-none-any.whl", hash 
= "sha256:0c96841e364b0ac401e7f45ee67ee523d69eb7bee59476b8ee96ba39fc964491"}, - {file = "jupytext-1.16.5.tar.gz", hash = "sha256:2d5f896f11ebee8342f0f5f9c4818a336e12db164bcaec009ea612cd5dc2caa8"}, + {file = "jupytext-1.16.6-py3-none-any.whl", hash = "sha256:900132031f73fee15a1c9ebd862e05eb5f51e1ad6ab3a2c6fdd97ce2f9c913b4"}, + {file = "jupytext-1.16.6.tar.gz", hash = "sha256:dbd03f9263c34b737003f388fc069e9030834fb7136879c4c32c32473557baa0"}, ] [package.dependencies] @@ -2301,13 +2275,13 @@ files = [ [[package]] name = "marshmallow" -version = "3.23.1" +version = "3.23.2" description = "A lightweight library for converting complex datatypes to and from native Python datatypes." optional = false python-versions = ">=3.9" files = [ - {file = "marshmallow-3.23.1-py3-none-any.whl", hash = "sha256:fece2eb2c941180ea1b7fcbd4a83c51bfdd50093fdd3ad2585ee5e1df2508491"}, - {file = "marshmallow-3.23.1.tar.gz", hash = "sha256:3a8dfda6edd8dcdbf216c0ede1d1e78d230a6dc9c5a088f58c4083b974a0d468"}, + {file = "marshmallow-3.23.2-py3-none-any.whl", hash = "sha256:bcaf2d6fd74fb1459f8450e85d994997ad3e70036452cbfa4ab685acb19479b3"}, + {file = "marshmallow-3.23.2.tar.gz", hash = "sha256:c448ac6455ca4d794773f00bae22c2f351d62d739929f761dce5eacb5c468d7f"}, ] [package.dependencies] @@ -2611,13 +2585,13 @@ portalocker = ">=1.4,<3" [[package]] name = "nbclient" -version = "0.10.1" +version = "0.10.2" description = "A client library for executing notebooks. Formerly nbconvert's ExecutePreprocessor." 
optional = false -python-versions = ">=3.8.0" +python-versions = ">=3.9.0" files = [ - {file = "nbclient-0.10.1-py3-none-any.whl", hash = "sha256:949019b9240d66897e442888cfb618f69ef23dc71c01cb5fced8499c2cfc084d"}, - {file = "nbclient-0.10.1.tar.gz", hash = "sha256:3e93e348ab27e712acd46fccd809139e356eb9a31aab641d1a7991a6eb4e6f68"}, + {file = "nbclient-0.10.2-py3-none-any.whl", hash = "sha256:4ffee11e788b4a27fabeb7955547e4318a5298f34342a4bfd01f2e1faaeadc3d"}, + {file = "nbclient-0.10.2.tar.gz", hash = "sha256:90b7fc6b810630db87a6d0c2250b1f0ab4cf4d3c27a299b0cde78a4ed3fd9193"}, ] [package.dependencies] @@ -2628,8 +2602,8 @@ traitlets = ">=5.4" [package.extras] dev = ["pre-commit"] -docs = ["autodoc-traits", "flaky", "ipykernel (>=6.19.3)", "ipython", "ipywidgets", "mock", "moto", "myst-parser", "nbconvert (>=7.0.0)", "pytest (>=7.0,<8)", "pytest-asyncio", "pytest-cov (>=4.0)", "sphinx (>=1.7)", "sphinx-book-theme", "sphinxcontrib-spelling", "testpath", "xmltodict"] -test = ["flaky", "ipykernel (>=6.19.3)", "ipython", "ipywidgets", "nbconvert (>=7.0.0)", "pytest (>=7.0,<8)", "pytest-asyncio", "pytest-cov (>=4.0)", "testpath", "xmltodict"] +docs = ["autodoc-traits", "flaky", "ipykernel (>=6.19.3)", "ipython", "ipywidgets", "mock", "moto", "myst-parser", "nbconvert (>=7.1.0)", "pytest (>=7.0,<8)", "pytest-asyncio", "pytest-cov (>=4.0)", "sphinx (>=1.7)", "sphinx-book-theme", "sphinxcontrib-spelling", "testpath", "xmltodict"] +test = ["flaky", "ipykernel (>=6.19.3)", "ipython", "ipywidgets", "nbconvert (>=7.1.0)", "pytest (>=7.0,<8)", "pytest-asyncio", "pytest-cov (>=4.0)", "testpath", "xmltodict"] [[package]] name = "nbconvert" @@ -2876,13 +2850,13 @@ files = [ [[package]] name = "openai" -version = "1.57.4" +version = "1.58.1" description = "The official Python library for the openai API" optional = false python-versions = ">=3.8" files = [ - {file = "openai-1.57.4-py3-none-any.whl", hash = "sha256:7def1ab2d52f196357ce31b9cfcf4181529ce00838286426bb35be81c035dafb"}, - 
{file = "openai-1.57.4.tar.gz", hash = "sha256:a8f071a3e9198e2818f63aade68e759417b9f62c0971bdb83de82504b70b77f7"}, + {file = "openai-1.58.1-py3-none-any.whl", hash = "sha256:e2910b1170a6b7f88ef491ac3a42c387f08bd3db533411f7ee391d166571d63c"}, + {file = "openai-1.58.1.tar.gz", hash = "sha256:f5a035fd01e141fc743f4b0e02c41ca49be8fab0866d3b67f5f29b4f4d3c0973"}, ] [package.dependencies] @@ -2897,6 +2871,7 @@ typing-extensions = ">=4.11,<5" [package.extras] datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"] +realtime = ["websockets (>=13,<15)"] [[package]] name = "overrides" @@ -3355,32 +3330,32 @@ wcwidth = "*" [[package]] name = "psutil" -version = "6.1.0" +version = "6.1.1" description = "Cross-platform lib for process and system monitoring in Python." optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" files = [ - {file = "psutil-6.1.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:ff34df86226c0227c52f38b919213157588a678d049688eded74c76c8ba4a5d0"}, - {file = "psutil-6.1.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:c0e0c00aa18ca2d3b2b991643b799a15fc8f0563d2ebb6040f64ce8dc027b942"}, - {file = "psutil-6.1.0-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:000d1d1ebd634b4efb383f4034437384e44a6d455260aaee2eca1e9c1b55f047"}, - {file = "psutil-6.1.0-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:5cd2bcdc75b452ba2e10f0e8ecc0b57b827dd5d7aaffbc6821b2a9a242823a76"}, - {file = "psutil-6.1.0-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:045f00a43c737f960d273a83973b2511430d61f283a44c96bf13a6e829ba8fdc"}, - {file = "psutil-6.1.0-cp27-none-win32.whl", hash = "sha256:9118f27452b70bb1d9ab3198c1f626c2499384935aaf55388211ad982611407e"}, - {file = "psutil-6.1.0-cp27-none-win_amd64.whl", hash = "sha256:a8506f6119cff7015678e2bce904a4da21025cc70ad283a53b099e7620061d85"}, - {file = "psutil-6.1.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = 
"sha256:6e2dcd475ce8b80522e51d923d10c7871e45f20918e027ab682f94f1c6351688"}, - {file = "psutil-6.1.0-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:0895b8414afafc526712c498bd9de2b063deaac4021a3b3c34566283464aff8e"}, - {file = "psutil-6.1.0-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9dcbfce5d89f1d1f2546a2090f4fcf87c7f669d1d90aacb7d7582addece9fb38"}, - {file = "psutil-6.1.0-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:498c6979f9c6637ebc3a73b3f87f9eb1ec24e1ce53a7c5173b8508981614a90b"}, - {file = "psutil-6.1.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d905186d647b16755a800e7263d43df08b790d709d575105d419f8b6ef65423a"}, - {file = "psutil-6.1.0-cp36-cp36m-win32.whl", hash = "sha256:6d3fbbc8d23fcdcb500d2c9f94e07b1342df8ed71b948a2649b5cb060a7c94ca"}, - {file = "psutil-6.1.0-cp36-cp36m-win_amd64.whl", hash = "sha256:1209036fbd0421afde505a4879dee3b2fd7b1e14fee81c0069807adcbbcca747"}, - {file = "psutil-6.1.0-cp37-abi3-win32.whl", hash = "sha256:1ad45a1f5d0b608253b11508f80940985d1d0c8f6111b5cb637533a0e6ddc13e"}, - {file = "psutil-6.1.0-cp37-abi3-win_amd64.whl", hash = "sha256:a8fb3752b491d246034fa4d279ff076501588ce8cbcdbb62c32fd7a377d996be"}, - {file = "psutil-6.1.0.tar.gz", hash = "sha256:353815f59a7f64cdaca1c0307ee13558a0512f6db064e92fe833784f08539c7a"}, + {file = "psutil-6.1.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:9ccc4316f24409159897799b83004cb1e24f9819b0dcf9c0b68bdcb6cefee6a8"}, + {file = "psutil-6.1.1-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:ca9609c77ea3b8481ab005da74ed894035936223422dc591d6772b147421f777"}, + {file = "psutil-6.1.1-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:8df0178ba8a9e5bc84fed9cfa61d54601b371fbec5c8eebad27575f1e105c0d4"}, + {file = "psutil-6.1.1-cp27-cp27mu-manylinux2010_i686.whl", hash = 
"sha256:1924e659d6c19c647e763e78670a05dbb7feaf44a0e9c94bf9e14dfc6ba50468"}, + {file = "psutil-6.1.1-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:018aeae2af92d943fdf1da6b58665124897cfc94faa2ca92098838f83e1b1bca"}, + {file = "psutil-6.1.1-cp27-none-win32.whl", hash = "sha256:6d4281f5bbca041e2292be3380ec56a9413b790579b8e593b1784499d0005dac"}, + {file = "psutil-6.1.1-cp27-none-win_amd64.whl", hash = "sha256:c777eb75bb33c47377c9af68f30e9f11bc78e0f07fbf907be4a5d70b2fe5f030"}, + {file = "psutil-6.1.1-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:fc0ed7fe2231a444fc219b9c42d0376e0a9a1a72f16c5cfa0f68d19f1a0663e8"}, + {file = "psutil-6.1.1-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:0bdd4eab935276290ad3cb718e9809412895ca6b5b334f5a9111ee6d9aff9377"}, + {file = "psutil-6.1.1-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b6e06c20c05fe95a3d7302d74e7097756d4ba1247975ad6905441ae1b5b66003"}, + {file = "psutil-6.1.1-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:97f7cb9921fbec4904f522d972f0c0e1f4fabbdd4e0287813b21215074a0f160"}, + {file = "psutil-6.1.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:33431e84fee02bc84ea36d9e2c4a6d395d479c9dd9bba2376c1f6ee8f3a4e0b3"}, + {file = "psutil-6.1.1-cp36-cp36m-win32.whl", hash = "sha256:384636b1a64b47814437d1173be1427a7c83681b17a450bfc309a1953e329603"}, + {file = "psutil-6.1.1-cp36-cp36m-win_amd64.whl", hash = "sha256:8be07491f6ebe1a693f17d4f11e69d0dc1811fa082736500f649f79df7735303"}, + {file = "psutil-6.1.1-cp37-abi3-win32.whl", hash = "sha256:eaa912e0b11848c4d9279a93d7e2783df352b082f40111e078388701fd479e53"}, + {file = "psutil-6.1.1-cp37-abi3-win_amd64.whl", hash = "sha256:f35cfccb065fff93529d2afb4a2e89e363fe63ca1e4a5da22b603a85833c2649"}, + {file = "psutil-6.1.1.tar.gz", hash = "sha256:cf8496728c18f2d0b45198f06895be52f36611711746b7f30c464b422b50e2f5"}, ] 
[package.extras] -dev = ["black", "check-manifest", "coverage", "packaging", "pylint", "pyperf", "pypinfo", "pytest-cov", "requests", "rstcheck", "ruff", "sphinx", "sphinx_rtd_theme", "toml-sort", "twine", "virtualenv", "wheel"] +dev = ["abi3audit", "black", "check-manifest", "coverage", "packaging", "pylint", "pyperf", "pypinfo", "pytest-cov", "requests", "rstcheck", "ruff", "sphinx", "sphinx_rtd_theme", "toml-sort", "twine", "virtualenv", "vulture", "wheel"] test = ["pytest", "pytest-xdist", "setuptools"] [[package]] @@ -3486,18 +3461,18 @@ files = [ [[package]] name = "pydantic" -version = "2.10.3" +version = "2.10.4" description = "Data validation using Python type hints" optional = false python-versions = ">=3.8" files = [ - {file = "pydantic-2.10.3-py3-none-any.whl", hash = "sha256:be04d85bbc7b65651c5f8e6b9976ed9c6f41782a55524cef079a34a0bb82144d"}, - {file = "pydantic-2.10.3.tar.gz", hash = "sha256:cb5ac360ce894ceacd69c403187900a02c4b20b693a9dd1d643e1effab9eadf9"}, + {file = "pydantic-2.10.4-py3-none-any.whl", hash = "sha256:597e135ea68be3a37552fb524bc7d0d66dcf93d395acd93a00682f1efcb8ee3d"}, + {file = "pydantic-2.10.4.tar.gz", hash = "sha256:82f12e9723da6de4fe2ba888b5971157b3be7ad914267dea8f05f82b28254f06"}, ] [package.dependencies] annotated-types = ">=0.6.0" -pydantic-core = "2.27.1" +pydantic-core = "2.27.2" typing-extensions = ">=4.12.2" [package.extras] @@ -3506,111 +3481,111 @@ timezone = ["tzdata"] [[package]] name = "pydantic-core" -version = "2.27.1" +version = "2.27.2" description = "Core functionality for Pydantic validation and serialization" optional = false python-versions = ">=3.8" files = [ - {file = "pydantic_core-2.27.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:71a5e35c75c021aaf400ac048dacc855f000bdfed91614b4a726f7432f1f3d6a"}, - {file = "pydantic_core-2.27.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f82d068a2d6ecfc6e054726080af69a6764a10015467d7d7b9f66d6ed5afa23b"}, - {file = 
"pydantic_core-2.27.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:121ceb0e822f79163dd4699e4c54f5ad38b157084d97b34de8b232bcaad70278"}, - {file = "pydantic_core-2.27.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4603137322c18eaf2e06a4495f426aa8d8388940f3c457e7548145011bb68e05"}, - {file = "pydantic_core-2.27.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a33cd6ad9017bbeaa9ed78a2e0752c5e250eafb9534f308e7a5f7849b0b1bfb4"}, - {file = "pydantic_core-2.27.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:15cc53a3179ba0fcefe1e3ae50beb2784dede4003ad2dfd24f81bba4b23a454f"}, - {file = "pydantic_core-2.27.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:45d9c5eb9273aa50999ad6adc6be5e0ecea7e09dbd0d31bd0c65a55a2592ca08"}, - {file = "pydantic_core-2.27.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:8bf7b66ce12a2ac52d16f776b31d16d91033150266eb796967a7e4621707e4f6"}, - {file = "pydantic_core-2.27.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:655d7dd86f26cb15ce8a431036f66ce0318648f8853d709b4167786ec2fa4807"}, - {file = "pydantic_core-2.27.1-cp310-cp310-musllinux_1_1_armv7l.whl", hash = "sha256:5556470f1a2157031e676f776c2bc20acd34c1990ca5f7e56f1ebf938b9ab57c"}, - {file = "pydantic_core-2.27.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f69ed81ab24d5a3bd93861c8c4436f54afdf8e8cc421562b0c7504cf3be58206"}, - {file = "pydantic_core-2.27.1-cp310-none-win32.whl", hash = "sha256:f5a823165e6d04ccea61a9f0576f345f8ce40ed533013580e087bd4d7442b52c"}, - {file = "pydantic_core-2.27.1-cp310-none-win_amd64.whl", hash = "sha256:57866a76e0b3823e0b56692d1a0bf722bffb324839bb5b7226a7dbd6c9a40b17"}, - {file = "pydantic_core-2.27.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:ac3b20653bdbe160febbea8aa6c079d3df19310d50ac314911ed8cc4eb7f8cb8"}, - {file = 
"pydantic_core-2.27.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a5a8e19d7c707c4cadb8c18f5f60c843052ae83c20fa7d44f41594c644a1d330"}, - {file = "pydantic_core-2.27.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7f7059ca8d64fea7f238994c97d91f75965216bcbe5f695bb44f354893f11d52"}, - {file = "pydantic_core-2.27.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bed0f8a0eeea9fb72937ba118f9db0cb7e90773462af7962d382445f3005e5a4"}, - {file = "pydantic_core-2.27.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a3cb37038123447cf0f3ea4c74751f6a9d7afef0eb71aa07bf5f652b5e6a132c"}, - {file = "pydantic_core-2.27.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:84286494f6c5d05243456e04223d5a9417d7f443c3b76065e75001beb26f88de"}, - {file = "pydantic_core-2.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:acc07b2cfc5b835444b44a9956846b578d27beeacd4b52e45489e93276241025"}, - {file = "pydantic_core-2.27.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4fefee876e07a6e9aad7a8c8c9f85b0cdbe7df52b8a9552307b09050f7512c7e"}, - {file = "pydantic_core-2.27.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:258c57abf1188926c774a4c94dd29237e77eda19462e5bb901d88adcab6af919"}, - {file = "pydantic_core-2.27.1-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:35c14ac45fcfdf7167ca76cc80b2001205a8d5d16d80524e13508371fb8cdd9c"}, - {file = "pydantic_core-2.27.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d1b26e1dff225c31897696cab7d4f0a315d4c0d9e8666dbffdb28216f3b17fdc"}, - {file = "pydantic_core-2.27.1-cp311-none-win32.whl", hash = "sha256:2cdf7d86886bc6982354862204ae3b2f7f96f21a3eb0ba5ca0ac42c7b38598b9"}, - {file = "pydantic_core-2.27.1-cp311-none-win_amd64.whl", hash = "sha256:3af385b0cee8df3746c3f406f38bcbfdc9041b5c2d5ce3e5fc6637256e60bbc5"}, - {file = "pydantic_core-2.27.1-cp311-none-win_arm64.whl", hash = 
"sha256:81f2ec23ddc1b476ff96563f2e8d723830b06dceae348ce02914a37cb4e74b89"}, - {file = "pydantic_core-2.27.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:9cbd94fc661d2bab2bc702cddd2d3370bbdcc4cd0f8f57488a81bcce90c7a54f"}, - {file = "pydantic_core-2.27.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5f8c4718cd44ec1580e180cb739713ecda2bdee1341084c1467802a417fe0f02"}, - {file = "pydantic_core-2.27.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:15aae984e46de8d376df515f00450d1522077254ef6b7ce189b38ecee7c9677c"}, - {file = "pydantic_core-2.27.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1ba5e3963344ff25fc8c40da90f44b0afca8cfd89d12964feb79ac1411a260ac"}, - {file = "pydantic_core-2.27.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:992cea5f4f3b29d6b4f7f1726ed8ee46c8331c6b4eed6db5b40134c6fe1768bb"}, - {file = "pydantic_core-2.27.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0325336f348dbee6550d129b1627cb8f5351a9dc91aad141ffb96d4937bd9529"}, - {file = "pydantic_core-2.27.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7597c07fbd11515f654d6ece3d0e4e5093edc30a436c63142d9a4b8e22f19c35"}, - {file = "pydantic_core-2.27.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:3bbd5d8cc692616d5ef6fbbbd50dbec142c7e6ad9beb66b78a96e9c16729b089"}, - {file = "pydantic_core-2.27.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:dc61505e73298a84a2f317255fcc72b710b72980f3a1f670447a21efc88f8381"}, - {file = "pydantic_core-2.27.1-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:e1f735dc43da318cad19b4173dd1ffce1d84aafd6c9b782b3abc04a0d5a6f5bb"}, - {file = "pydantic_core-2.27.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:f4e5658dbffe8843a0f12366a4c2d1c316dbe09bb4dfbdc9d2d9cd6031de8aae"}, - {file = "pydantic_core-2.27.1-cp312-none-win32.whl", hash = 
"sha256:672ebbe820bb37988c4d136eca2652ee114992d5d41c7e4858cdd90ea94ffe5c"}, - {file = "pydantic_core-2.27.1-cp312-none-win_amd64.whl", hash = "sha256:66ff044fd0bb1768688aecbe28b6190f6e799349221fb0de0e6f4048eca14c16"}, - {file = "pydantic_core-2.27.1-cp312-none-win_arm64.whl", hash = "sha256:9a3b0793b1bbfd4146304e23d90045f2a9b5fd5823aa682665fbdaf2a6c28f3e"}, - {file = "pydantic_core-2.27.1-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:f216dbce0e60e4d03e0c4353c7023b202d95cbaeff12e5fd2e82ea0a66905073"}, - {file = "pydantic_core-2.27.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a2e02889071850bbfd36b56fd6bc98945e23670773bc7a76657e90e6b6603c08"}, - {file = "pydantic_core-2.27.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42b0e23f119b2b456d07ca91b307ae167cc3f6c846a7b169fca5326e32fdc6cf"}, - {file = "pydantic_core-2.27.1-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:764be71193f87d460a03f1f7385a82e226639732214b402f9aa61f0d025f0737"}, - {file = "pydantic_core-2.27.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1c00666a3bd2f84920a4e94434f5974d7bbc57e461318d6bb34ce9cdbbc1f6b2"}, - {file = "pydantic_core-2.27.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3ccaa88b24eebc0f849ce0a4d09e8a408ec5a94afff395eb69baf868f5183107"}, - {file = "pydantic_core-2.27.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c65af9088ac534313e1963443d0ec360bb2b9cba6c2909478d22c2e363d98a51"}, - {file = "pydantic_core-2.27.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:206b5cf6f0c513baffaeae7bd817717140770c74528f3e4c3e1cec7871ddd61a"}, - {file = "pydantic_core-2.27.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:062f60e512fc7fff8b8a9d680ff0ddaaef0193dba9fa83e679c0c5f5fbd018bc"}, - {file = "pydantic_core-2.27.1-cp313-cp313-musllinux_1_1_armv7l.whl", hash = 
"sha256:a0697803ed7d4af5e4c1adf1670af078f8fcab7a86350e969f454daf598c4960"}, - {file = "pydantic_core-2.27.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:58ca98a950171f3151c603aeea9303ef6c235f692fe555e883591103da709b23"}, - {file = "pydantic_core-2.27.1-cp313-none-win32.whl", hash = "sha256:8065914ff79f7eab1599bd80406681f0ad08f8e47c880f17b416c9f8f7a26d05"}, - {file = "pydantic_core-2.27.1-cp313-none-win_amd64.whl", hash = "sha256:ba630d5e3db74c79300d9a5bdaaf6200172b107f263c98a0539eeecb857b2337"}, - {file = "pydantic_core-2.27.1-cp313-none-win_arm64.whl", hash = "sha256:45cf8588c066860b623cd11c4ba687f8d7175d5f7ef65f7129df8a394c502de5"}, - {file = "pydantic_core-2.27.1-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:5897bec80a09b4084aee23f9b73a9477a46c3304ad1d2d07acca19723fb1de62"}, - {file = "pydantic_core-2.27.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:d0165ab2914379bd56908c02294ed8405c252250668ebcb438a55494c69f44ab"}, - {file = "pydantic_core-2.27.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b9af86e1d8e4cfc82c2022bfaa6f459381a50b94a29e95dcdda8442d6d83864"}, - {file = "pydantic_core-2.27.1-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5f6c8a66741c5f5447e047ab0ba7a1c61d1e95580d64bce852e3df1f895c4067"}, - {file = "pydantic_core-2.27.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9a42d6a8156ff78981f8aa56eb6394114e0dedb217cf8b729f438f643608cbcd"}, - {file = "pydantic_core-2.27.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:64c65f40b4cd8b0e049a8edde07e38b476da7e3aaebe63287c899d2cff253fa5"}, - {file = "pydantic_core-2.27.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9fdcf339322a3fae5cbd504edcefddd5a50d9ee00d968696846f089b4432cf78"}, - {file = "pydantic_core-2.27.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:bf99c8404f008750c846cb4ac4667b798a9f7de673ff719d705d9b2d6de49c5f"}, - {file = 
"pydantic_core-2.27.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:8f1edcea27918d748c7e5e4d917297b2a0ab80cad10f86631e488b7cddf76a36"}, - {file = "pydantic_core-2.27.1-cp38-cp38-musllinux_1_1_armv7l.whl", hash = "sha256:159cac0a3d096f79ab6a44d77a961917219707e2a130739c64d4dd46281f5c2a"}, - {file = "pydantic_core-2.27.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:029d9757eb621cc6e1848fa0b0310310de7301057f623985698ed7ebb014391b"}, - {file = "pydantic_core-2.27.1-cp38-none-win32.whl", hash = "sha256:a28af0695a45f7060e6f9b7092558a928a28553366519f64083c63a44f70e618"}, - {file = "pydantic_core-2.27.1-cp38-none-win_amd64.whl", hash = "sha256:2d4567c850905d5eaaed2f7a404e61012a51caf288292e016360aa2b96ff38d4"}, - {file = "pydantic_core-2.27.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:e9386266798d64eeb19dd3677051f5705bf873e98e15897ddb7d76f477131967"}, - {file = "pydantic_core-2.27.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4228b5b646caa73f119b1ae756216b59cc6e2267201c27d3912b592c5e323b60"}, - {file = "pydantic_core-2.27.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0b3dfe500de26c52abe0477dde16192ac39c98f05bf2d80e76102d394bd13854"}, - {file = "pydantic_core-2.27.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:aee66be87825cdf72ac64cb03ad4c15ffef4143dbf5c113f64a5ff4f81477bf9"}, - {file = "pydantic_core-2.27.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3b748c44bb9f53031c8cbc99a8a061bc181c1000c60a30f55393b6e9c45cc5bd"}, - {file = "pydantic_core-2.27.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5ca038c7f6a0afd0b2448941b6ef9d5e1949e999f9e5517692eb6da58e9d44be"}, - {file = "pydantic_core-2.27.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e0bd57539da59a3e4671b90a502da9a28c72322a4f17866ba3ac63a82c4498e"}, - {file = "pydantic_core-2.27.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = 
"sha256:ac6c2c45c847bbf8f91930d88716a0fb924b51e0c6dad329b793d670ec5db792"}, - {file = "pydantic_core-2.27.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b94d4ba43739bbe8b0ce4262bcc3b7b9f31459ad120fb595627eaeb7f9b9ca01"}, - {file = "pydantic_core-2.27.1-cp39-cp39-musllinux_1_1_armv7l.whl", hash = "sha256:00e6424f4b26fe82d44577b4c842d7df97c20be6439e8e685d0d715feceb9fb9"}, - {file = "pydantic_core-2.27.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:38de0a70160dd97540335b7ad3a74571b24f1dc3ed33f815f0880682e6880131"}, - {file = "pydantic_core-2.27.1-cp39-none-win32.whl", hash = "sha256:7ccebf51efc61634f6c2344da73e366c75e735960b5654b63d7e6f69a5885fa3"}, - {file = "pydantic_core-2.27.1-cp39-none-win_amd64.whl", hash = "sha256:a57847b090d7892f123726202b7daa20df6694cbd583b67a592e856bff603d6c"}, - {file = "pydantic_core-2.27.1-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:3fa80ac2bd5856580e242dbc202db873c60a01b20309c8319b5c5986fbe53ce6"}, - {file = "pydantic_core-2.27.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:d950caa237bb1954f1b8c9227b5065ba6875ac9771bb8ec790d956a699b78676"}, - {file = "pydantic_core-2.27.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0e4216e64d203e39c62df627aa882f02a2438d18a5f21d7f721621f7a5d3611d"}, - {file = "pydantic_core-2.27.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:02a3d637bd387c41d46b002f0e49c52642281edacd2740e5a42f7017feea3f2c"}, - {file = "pydantic_core-2.27.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:161c27ccce13b6b0c8689418da3885d3220ed2eae2ea5e9b2f7f3d48f1d52c27"}, - {file = "pydantic_core-2.27.1-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:19910754e4cc9c63bc1c7f6d73aa1cfee82f42007e407c0f413695c2f7ed777f"}, - {file = "pydantic_core-2.27.1-pp310-pypy310_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:e173486019cc283dc9778315fa29a363579372fe67045e971e89b6365cc035ed"}, - 
{file = "pydantic_core-2.27.1-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:af52d26579b308921b73b956153066481f064875140ccd1dfd4e77db89dbb12f"}, - {file = "pydantic_core-2.27.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:981fb88516bd1ae8b0cbbd2034678a39dedc98752f264ac9bc5839d3923fa04c"}, - {file = "pydantic_core-2.27.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5fde892e6c697ce3e30c61b239330fc5d569a71fefd4eb6512fc6caec9dd9e2f"}, - {file = "pydantic_core-2.27.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:816f5aa087094099fff7edabb5e01cc370eb21aa1a1d44fe2d2aefdfb5599b31"}, - {file = "pydantic_core-2.27.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9c10c309e18e443ddb108f0ef64e8729363adbfd92d6d57beec680f6261556f3"}, - {file = "pydantic_core-2.27.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:98476c98b02c8e9b2eec76ac4156fd006628b1b2d0ef27e548ffa978393fd154"}, - {file = "pydantic_core-2.27.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c3027001c28434e7ca5a6e1e527487051136aa81803ac812be51802150d880dd"}, - {file = "pydantic_core-2.27.1-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:7699b1df36a48169cdebda7ab5a2bac265204003f153b4bd17276153d997670a"}, - {file = "pydantic_core-2.27.1-pp39-pypy39_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:1c39b07d90be6b48968ddc8c19e7585052088fd7ec8d568bb31ff64c70ae3c97"}, - {file = "pydantic_core-2.27.1-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:46ccfe3032b3915586e469d4972973f893c0a2bb65669194a5bdea9bacc088c2"}, - {file = "pydantic_core-2.27.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:62ba45e21cf6571d7f716d903b5b7b6d2617e2d5d67c0923dc47b9d41369f840"}, - {file = "pydantic_core-2.27.1.tar.gz", hash = "sha256:62a763352879b84aa31058fc931884055fd75089cccbd9d58bb6afd01141b235"}, + {file = "pydantic_core-2.27.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = 
"sha256:2d367ca20b2f14095a8f4fa1210f5a7b78b8a20009ecced6b12818f455b1e9fa"}, + {file = "pydantic_core-2.27.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:491a2b73db93fab69731eaee494f320faa4e093dbed776be1a829c2eb222c34c"}, + {file = "pydantic_core-2.27.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7969e133a6f183be60e9f6f56bfae753585680f3b7307a8e555a948d443cc05a"}, + {file = "pydantic_core-2.27.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3de9961f2a346257caf0aa508a4da705467f53778e9ef6fe744c038119737ef5"}, + {file = "pydantic_core-2.27.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e2bb4d3e5873c37bb3dd58714d4cd0b0e6238cebc4177ac8fe878f8b3aa8e74c"}, + {file = "pydantic_core-2.27.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:280d219beebb0752699480fe8f1dc61ab6615c2046d76b7ab7ee38858de0a4e7"}, + {file = "pydantic_core-2.27.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47956ae78b6422cbd46f772f1746799cbb862de838fd8d1fbd34a82e05b0983a"}, + {file = "pydantic_core-2.27.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:14d4a5c49d2f009d62a2a7140d3064f686d17a5d1a268bc641954ba181880236"}, + {file = "pydantic_core-2.27.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:337b443af21d488716f8d0b6164de833e788aa6bd7e3a39c005febc1284f4962"}, + {file = "pydantic_core-2.27.2-cp310-cp310-musllinux_1_1_armv7l.whl", hash = "sha256:03d0f86ea3184a12f41a2d23f7ccb79cdb5a18e06993f8a45baa8dfec746f0e9"}, + {file = "pydantic_core-2.27.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:7041c36f5680c6e0f08d922aed302e98b3745d97fe1589db0a3eebf6624523af"}, + {file = "pydantic_core-2.27.2-cp310-cp310-win32.whl", hash = "sha256:50a68f3e3819077be2c98110c1f9dcb3817e93f267ba80a2c05bb4f8799e2ff4"}, + {file = "pydantic_core-2.27.2-cp310-cp310-win_amd64.whl", hash = 
"sha256:e0fd26b16394ead34a424eecf8a31a1f5137094cabe84a1bcb10fa6ba39d3d31"}, + {file = "pydantic_core-2.27.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:8e10c99ef58cfdf2a66fc15d66b16c4a04f62bca39db589ae8cba08bc55331bc"}, + {file = "pydantic_core-2.27.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:26f32e0adf166a84d0cb63be85c562ca8a6fa8de28e5f0d92250c6b7e9e2aff7"}, + {file = "pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c19d1ea0673cd13cc2f872f6c9ab42acc4e4f492a7ca9d3795ce2b112dd7e15"}, + {file = "pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5e68c4446fe0810e959cdff46ab0a41ce2f2c86d227d96dc3847af0ba7def306"}, + {file = "pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d9640b0059ff4f14d1f37321b94061c6db164fbe49b334b31643e0528d100d99"}, + {file = "pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:40d02e7d45c9f8af700f3452f329ead92da4c5f4317ca9b896de7ce7199ea459"}, + {file = "pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c1fd185014191700554795c99b347d64f2bb637966c4cfc16998a0ca700d048"}, + {file = "pydantic_core-2.27.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d81d2068e1c1228a565af076598f9e7451712700b673de8f502f0334f281387d"}, + {file = "pydantic_core-2.27.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:1a4207639fb02ec2dbb76227d7c751a20b1a6b4bc52850568e52260cae64ca3b"}, + {file = "pydantic_core-2.27.2-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:3de3ce3c9ddc8bbd88f6e0e304dea0e66d843ec9de1b0042b0911c1663ffd474"}, + {file = "pydantic_core-2.27.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:30c5f68ded0c36466acede341551106821043e9afaad516adfb6e8fa80a4e6a6"}, + {file = "pydantic_core-2.27.2-cp311-cp311-win32.whl", hash = 
"sha256:c70c26d2c99f78b125a3459f8afe1aed4d9687c24fd677c6a4436bc042e50d6c"}, + {file = "pydantic_core-2.27.2-cp311-cp311-win_amd64.whl", hash = "sha256:08e125dbdc505fa69ca7d9c499639ab6407cfa909214d500897d02afb816e7cc"}, + {file = "pydantic_core-2.27.2-cp311-cp311-win_arm64.whl", hash = "sha256:26f0d68d4b235a2bae0c3fc585c585b4ecc51382db0e3ba402a22cbc440915e4"}, + {file = "pydantic_core-2.27.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:9e0c8cfefa0ef83b4da9588448b6d8d2a2bf1a53c3f1ae5fca39eb3061e2f0b0"}, + {file = "pydantic_core-2.27.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:83097677b8e3bd7eaa6775720ec8e0405f1575015a463285a92bfdfe254529ef"}, + {file = "pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:172fce187655fece0c90d90a678424b013f8fbb0ca8b036ac266749c09438cb7"}, + {file = "pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:519f29f5213271eeeeb3093f662ba2fd512b91c5f188f3bb7b27bc5973816934"}, + {file = "pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:05e3a55d124407fffba0dd6b0c0cd056d10e983ceb4e5dbd10dda135c31071d6"}, + {file = "pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9c3ed807c7b91de05e63930188f19e921d1fe90de6b4f5cd43ee7fcc3525cb8c"}, + {file = "pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6fb4aadc0b9a0c063206846d603b92030eb6f03069151a625667f982887153e2"}, + {file = "pydantic_core-2.27.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:28ccb213807e037460326424ceb8b5245acb88f32f3d2777427476e1b32c48c4"}, + {file = "pydantic_core-2.27.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:de3cd1899e2c279b140adde9357c4495ed9d47131b4a4eaff9052f23398076b3"}, + {file = "pydantic_core-2.27.2-cp312-cp312-musllinux_1_1_armv7l.whl", hash = 
"sha256:220f892729375e2d736b97d0e51466252ad84c51857d4d15f5e9692f9ef12be4"}, + {file = "pydantic_core-2.27.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a0fcd29cd6b4e74fe8ddd2c90330fd8edf2e30cb52acda47f06dd615ae72da57"}, + {file = "pydantic_core-2.27.2-cp312-cp312-win32.whl", hash = "sha256:1e2cb691ed9834cd6a8be61228471d0a503731abfb42f82458ff27be7b2186fc"}, + {file = "pydantic_core-2.27.2-cp312-cp312-win_amd64.whl", hash = "sha256:cc3f1a99a4f4f9dd1de4fe0312c114e740b5ddead65bb4102884b384c15d8bc9"}, + {file = "pydantic_core-2.27.2-cp312-cp312-win_arm64.whl", hash = "sha256:3911ac9284cd8a1792d3cb26a2da18f3ca26c6908cc434a18f730dc0db7bfa3b"}, + {file = "pydantic_core-2.27.2-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:7d14bd329640e63852364c306f4d23eb744e0f8193148d4044dd3dacdaacbd8b"}, + {file = "pydantic_core-2.27.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:82f91663004eb8ed30ff478d77c4d1179b3563df6cdb15c0817cd1cdaf34d154"}, + {file = "pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:71b24c7d61131bb83df10cc7e687433609963a944ccf45190cfc21e0887b08c9"}, + {file = "pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fa8e459d4954f608fa26116118bb67f56b93b209c39b008277ace29937453dc9"}, + {file = "pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ce8918cbebc8da707ba805b7fd0b382816858728ae7fe19a942080c24e5b7cd1"}, + {file = "pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:eda3f5c2a021bbc5d976107bb302e0131351c2ba54343f8a496dc8783d3d3a6a"}, + {file = "pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bd8086fa684c4775c27f03f062cbb9eaa6e17f064307e86b21b9e0abc9c0f02e"}, + {file = "pydantic_core-2.27.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = 
"sha256:8d9b3388db186ba0c099a6d20f0604a44eabdeef1777ddd94786cdae158729e4"}, + {file = "pydantic_core-2.27.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:7a66efda2387de898c8f38c0cf7f14fca0b51a8ef0b24bfea5849f1b3c95af27"}, + {file = "pydantic_core-2.27.2-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:18a101c168e4e092ab40dbc2503bdc0f62010e95d292b27827871dc85450d7ee"}, + {file = "pydantic_core-2.27.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:ba5dd002f88b78a4215ed2f8ddbdf85e8513382820ba15ad5ad8955ce0ca19a1"}, + {file = "pydantic_core-2.27.2-cp313-cp313-win32.whl", hash = "sha256:1ebaf1d0481914d004a573394f4be3a7616334be70261007e47c2a6fe7e50130"}, + {file = "pydantic_core-2.27.2-cp313-cp313-win_amd64.whl", hash = "sha256:953101387ecf2f5652883208769a79e48db18c6df442568a0b5ccd8c2723abee"}, + {file = "pydantic_core-2.27.2-cp313-cp313-win_arm64.whl", hash = "sha256:ac4dbfd1691affb8f48c2c13241a2e3b60ff23247cbcf981759c768b6633cf8b"}, + {file = "pydantic_core-2.27.2-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:d3e8d504bdd3f10835468f29008d72fc8359d95c9c415ce6e767203db6127506"}, + {file = "pydantic_core-2.27.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:521eb9b7f036c9b6187f0b47318ab0d7ca14bd87f776240b90b21c1f4f149320"}, + {file = "pydantic_core-2.27.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:85210c4d99a0114f5a9481b44560d7d1e35e32cc5634c656bc48e590b669b145"}, + {file = "pydantic_core-2.27.2-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d716e2e30c6f140d7560ef1538953a5cd1a87264c737643d481f2779fc247fe1"}, + {file = "pydantic_core-2.27.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f66d89ba397d92f840f8654756196d93804278457b5fbede59598a1f9f90b228"}, + {file = "pydantic_core-2.27.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:669e193c1c576a58f132e3158f9dfa9662969edb1a250c54d8fa52590045f046"}, + {file = 
"pydantic_core-2.27.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9fdbe7629b996647b99c01b37f11170a57ae675375b14b8c13b8518b8320ced5"}, + {file = "pydantic_core-2.27.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d262606bf386a5ba0b0af3b97f37c83d7011439e3dc1a9298f21efb292e42f1a"}, + {file = "pydantic_core-2.27.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:cabb9bcb7e0d97f74df8646f34fc76fbf793b7f6dc2438517d7a9e50eee4f14d"}, + {file = "pydantic_core-2.27.2-cp38-cp38-musllinux_1_1_armv7l.whl", hash = "sha256:d2d63f1215638d28221f664596b1ccb3944f6e25dd18cd3b86b0a4c408d5ebb9"}, + {file = "pydantic_core-2.27.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:bca101c00bff0adb45a833f8451b9105d9df18accb8743b08107d7ada14bd7da"}, + {file = "pydantic_core-2.27.2-cp38-cp38-win32.whl", hash = "sha256:f6f8e111843bbb0dee4cb6594cdc73e79b3329b526037ec242a3e49012495b3b"}, + {file = "pydantic_core-2.27.2-cp38-cp38-win_amd64.whl", hash = "sha256:fd1aea04935a508f62e0d0ef1f5ae968774a32afc306fb8545e06f5ff5cdf3ad"}, + {file = "pydantic_core-2.27.2-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:c10eb4f1659290b523af58fa7cffb452a61ad6ae5613404519aee4bfbf1df993"}, + {file = "pydantic_core-2.27.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ef592d4bad47296fb11f96cd7dc898b92e795032b4894dfb4076cfccd43a9308"}, + {file = "pydantic_core-2.27.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c61709a844acc6bf0b7dce7daae75195a10aac96a596ea1b776996414791ede4"}, + {file = "pydantic_core-2.27.2-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:42c5f762659e47fdb7b16956c71598292f60a03aa92f8b6351504359dbdba6cf"}, + {file = "pydantic_core-2.27.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4c9775e339e42e79ec99c441d9730fccf07414af63eac2f0e48e08fd38a64d76"}, + {file = "pydantic_core-2.27.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:57762139821c31847cfb2df63c12f725788bd9f04bc2fb392790959b8f70f118"}, + {file = "pydantic_core-2.27.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d1e85068e818c73e048fe28cfc769040bb1f475524f4745a5dc621f75ac7630"}, + {file = "pydantic_core-2.27.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:097830ed52fd9e427942ff3b9bc17fab52913b2f50f2880dc4a5611446606a54"}, + {file = "pydantic_core-2.27.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:044a50963a614ecfae59bb1eaf7ea7efc4bc62f49ed594e18fa1e5d953c40e9f"}, + {file = "pydantic_core-2.27.2-cp39-cp39-musllinux_1_1_armv7l.whl", hash = "sha256:4e0b4220ba5b40d727c7f879eac379b822eee5d8fff418e9d3381ee45b3b0362"}, + {file = "pydantic_core-2.27.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5e4f4bb20d75e9325cc9696c6802657b58bc1dbbe3022f32cc2b2b632c3fbb96"}, + {file = "pydantic_core-2.27.2-cp39-cp39-win32.whl", hash = "sha256:cca63613e90d001b9f2f9a9ceb276c308bfa2a43fafb75c8031c4f66039e8c6e"}, + {file = "pydantic_core-2.27.2-cp39-cp39-win_amd64.whl", hash = "sha256:77d1bca19b0f7021b3a982e6f903dcd5b2b06076def36a652e3907f596e29f67"}, + {file = "pydantic_core-2.27.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:2bf14caea37e91198329b828eae1618c068dfb8ef17bb33287a7ad4b61ac314e"}, + {file = "pydantic_core-2.27.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:b0cb791f5b45307caae8810c2023a184c74605ec3bcbb67d13846c28ff731ff8"}, + {file = "pydantic_core-2.27.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:688d3fd9fcb71f41c4c015c023d12a79d1c4c0732ec9eb35d96e3388a120dcf3"}, + {file = "pydantic_core-2.27.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3d591580c34f4d731592f0e9fe40f9cc1b430d297eecc70b962e93c5c668f15f"}, + {file = "pydantic_core-2.27.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = 
"sha256:82f986faf4e644ffc189a7f1aafc86e46ef70372bb153e7001e8afccc6e54133"}, + {file = "pydantic_core-2.27.2-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:bec317a27290e2537f922639cafd54990551725fc844249e64c523301d0822fc"}, + {file = "pydantic_core-2.27.2-pp310-pypy310_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:0296abcb83a797db256b773f45773da397da75a08f5fcaef41f2044adec05f50"}, + {file = "pydantic_core-2.27.2-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:0d75070718e369e452075a6017fbf187f788e17ed67a3abd47fa934d001863d9"}, + {file = "pydantic_core-2.27.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:7e17b560be3c98a8e3aa66ce828bdebb9e9ac6ad5466fba92eb74c4c95cb1151"}, + {file = "pydantic_core-2.27.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:c33939a82924da9ed65dab5a65d427205a73181d8098e79b6b426bdf8ad4e656"}, + {file = "pydantic_core-2.27.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:00bad2484fa6bda1e216e7345a798bd37c68fb2d97558edd584942aa41b7d278"}, + {file = "pydantic_core-2.27.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c817e2b40aba42bac6f457498dacabc568c3b7a986fc9ba7c8d9d260b71485fb"}, + {file = "pydantic_core-2.27.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:251136cdad0cb722e93732cb45ca5299fb56e1344a833640bf93b2803f8d1bfd"}, + {file = "pydantic_core-2.27.2-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d2088237af596f0a524d3afc39ab3b036e8adb054ee57cbb1dcf8e09da5b29cc"}, + {file = "pydantic_core-2.27.2-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:d4041c0b966a84b4ae7a09832eb691a35aec90910cd2dbe7a208de59be77965b"}, + {file = "pydantic_core-2.27.2-pp39-pypy39_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:8083d4e875ebe0b864ffef72a4304827015cff328a1be6e22cc850753bfb122b"}, + {file = "pydantic_core-2.27.2-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = 
"sha256:f141ee28a0ad2123b6611b6ceff018039df17f32ada8b534e6aa039545a3efb2"}, + {file = "pydantic_core-2.27.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7d0c8399fcc1848491f00e0314bd59fb34a9c008761bcb422a057670c3f65e35"}, + {file = "pydantic_core-2.27.2.tar.gz", hash = "sha256:eb026e5a4c1fee05726072337ff51d1efb6f59090b7da90d30ea58625b1ffb39"}, ] [package.dependencies] @@ -3730,13 +3705,13 @@ diagrams = ["jinja2", "railroad-diagrams"] [[package]] name = "pyright" -version = "1.1.390" +version = "1.1.391" description = "Command line wrapper for pyright" optional = false python-versions = ">=3.7" files = [ - {file = "pyright-1.1.390-py3-none-any.whl", hash = "sha256:ecebfba5b6b50af7c1a44c2ba144ba2ab542c227eb49bc1f16984ff714e0e110"}, - {file = "pyright-1.1.390.tar.gz", hash = "sha256:aad7f160c49e0fbf8209507a15e17b781f63a86a1facb69ca877c71ef2e9538d"}, + {file = "pyright-1.1.391-py3-none-any.whl", hash = "sha256:54fa186f8b3e8a55a44ebfa842636635688670c6896dcf6cf4a7fc75062f4d15"}, + {file = "pyright-1.1.391.tar.gz", hash = "sha256:66b2d42cdf5c3cbab05f2f4b76e8bec8aa78e679bfa0b6ad7b923d9e027cadb2"}, ] [package.dependencies] @@ -4421,29 +4396,29 @@ files = [ [[package]] name = "ruff" -version = "0.8.3" +version = "0.8.5" description = "An extremely fast Python linter and code formatter, written in Rust." 
optional = false python-versions = ">=3.7" files = [ - {file = "ruff-0.8.3-py3-none-linux_armv6l.whl", hash = "sha256:8d5d273ffffff0acd3db5bf626d4b131aa5a5ada1276126231c4174543ce20d6"}, - {file = "ruff-0.8.3-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:e4d66a21de39f15c9757d00c50c8cdd20ac84f55684ca56def7891a025d7e939"}, - {file = "ruff-0.8.3-py3-none-macosx_11_0_arm64.whl", hash = "sha256:c356e770811858bd20832af696ff6c7e884701115094f427b64b25093d6d932d"}, - {file = "ruff-0.8.3-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9c0a60a825e3e177116c84009d5ebaa90cf40dfab56e1358d1df4e29a9a14b13"}, - {file = "ruff-0.8.3-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:75fb782f4db39501210ac093c79c3de581d306624575eddd7e4e13747e61ba18"}, - {file = "ruff-0.8.3-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7f26bc76a133ecb09a38b7868737eded6941b70a6d34ef53a4027e83913b6502"}, - {file = "ruff-0.8.3-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:01b14b2f72a37390c1b13477c1c02d53184f728be2f3ffc3ace5b44e9e87b90d"}, - {file = "ruff-0.8.3-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:53babd6e63e31f4e96ec95ea0d962298f9f0d9cc5990a1bbb023a6baf2503a82"}, - {file = "ruff-0.8.3-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1ae441ce4cf925b7f363d33cd6570c51435972d697e3e58928973994e56e1452"}, - {file = "ruff-0.8.3-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d7c65bc0cadce32255e93c57d57ecc2cca23149edd52714c0c5d6fa11ec328cd"}, - {file = "ruff-0.8.3-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:5be450bb18f23f0edc5a4e5585c17a56ba88920d598f04a06bd9fd76d324cb20"}, - {file = "ruff-0.8.3-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:8faeae3827eaa77f5721f09b9472a18c749139c891dbc17f45e72d8f2ca1f8fc"}, - {file = "ruff-0.8.3-py3-none-musllinux_1_2_i686.whl", hash = 
"sha256:db503486e1cf074b9808403991663e4277f5c664d3fe237ee0d994d1305bb060"}, - {file = "ruff-0.8.3-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:6567be9fb62fbd7a099209257fef4ad2c3153b60579818b31a23c886ed4147ea"}, - {file = "ruff-0.8.3-py3-none-win32.whl", hash = "sha256:19048f2f878f3ee4583fc6cb23fb636e48c2635e30fb2022b3a1cd293402f964"}, - {file = "ruff-0.8.3-py3-none-win_amd64.whl", hash = "sha256:f7df94f57d7418fa7c3ffb650757e0c2b96cf2501a0b192c18e4fb5571dfada9"}, - {file = "ruff-0.8.3-py3-none-win_arm64.whl", hash = "sha256:fe2756edf68ea79707c8d68b78ca9a58ed9af22e430430491ee03e718b5e4936"}, - {file = "ruff-0.8.3.tar.gz", hash = "sha256:5e7558304353b84279042fc584a4f4cb8a07ae79b2bf3da1a7551d960b5626d3"}, + {file = "ruff-0.8.5-py3-none-linux_armv6l.whl", hash = "sha256:5ad11a5e3868a73ca1fa4727fe7e33735ea78b416313f4368c504dbeb69c0f88"}, + {file = "ruff-0.8.5-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:f69ab37771ea7e0715fead8624ec42996d101269a96e31f4d31be6fc33aa19b7"}, + {file = "ruff-0.8.5-py3-none-macosx_11_0_arm64.whl", hash = "sha256:b5462d7804558ccff9c08fe8cbf6c14b7efe67404316696a2dde48297b1925bb"}, + {file = "ruff-0.8.5-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d56de7220a35607f9fe59f8a6d018e14504f7b71d784d980835e20fc0611cd50"}, + {file = "ruff-0.8.5-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9d99cf80b0429cbebf31cbbf6f24f05a29706f0437c40413d950e67e2d4faca4"}, + {file = "ruff-0.8.5-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7b75ac29715ac60d554a049dbb0ef3b55259076181c3369d79466cb130eb5afd"}, + {file = "ruff-0.8.5-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:c9d526a62c9eda211b38463528768fd0ada25dad524cb33c0e99fcff1c67b5dc"}, + {file = "ruff-0.8.5-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:587c5e95007612c26509f30acc506c874dab4c4abbacd0357400bd1aa799931b"}, + {file = 
"ruff-0.8.5-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:622b82bf3429ff0e346835ec213aec0a04d9730480cbffbb6ad9372014e31bbd"}, + {file = "ruff-0.8.5-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f99be814d77a5dac8a8957104bdd8c359e85c86b0ee0e38dca447cb1095f70fb"}, + {file = "ruff-0.8.5-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:c01c048f9c3385e0fd7822ad0fd519afb282af9cf1778f3580e540629df89725"}, + {file = "ruff-0.8.5-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:7512e8cb038db7f5db6aae0e24735ff9ea03bb0ed6ae2ce534e9baa23c1dc9ea"}, + {file = "ruff-0.8.5-py3-none-musllinux_1_2_i686.whl", hash = "sha256:762f113232acd5b768d6b875d16aad6b00082add40ec91c927f0673a8ec4ede8"}, + {file = "ruff-0.8.5-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:03a90200c5dfff49e4c967b405f27fdfa81594cbb7c5ff5609e42d7fe9680da5"}, + {file = "ruff-0.8.5-py3-none-win32.whl", hash = "sha256:8710ffd57bdaa6690cbf6ecff19884b8629ec2a2a2a2f783aa94b1cc795139ed"}, + {file = "ruff-0.8.5-py3-none-win_amd64.whl", hash = "sha256:4020d8bf8d3a32325c77af452a9976a9ad6455773bcb94991cf15bd66b347e47"}, + {file = "ruff-0.8.5-py3-none-win_arm64.whl", hash = "sha256:134ae019ef13e1b060ab7136e7828a6d83ea727ba123381307eb37c6bd5e01cb"}, + {file = "ruff-0.8.5.tar.gz", hash = "sha256:1098d36f69831f7ff2a1da3e6407d5fbd6dfa2559e4f74ff2d260c5588900317"}, ] [[package]] @@ -4639,13 +4614,13 @@ files = [ [[package]] name = "smart-open" -version = "7.0.5" +version = "7.1.0" description = "Utils for streaming large files (S3, HDFS, GCS, Azure Blob Storage, gzip, bz2...)" optional = false python-versions = "<4.0,>=3.7" files = [ - {file = "smart_open-7.0.5-py3-none-any.whl", hash = "sha256:8523ed805c12dff3eaa50e9c903a6cb0ae78800626631c5fe7ea073439847b89"}, - {file = "smart_open-7.0.5.tar.gz", hash = "sha256:d3672003b1dbc85e2013e4983b88eb9a5ccfd389b0d4e5015f39a9ee5620ec18"}, + {file = "smart_open-7.1.0-py3-none-any.whl", hash = 
"sha256:4b8489bb6058196258bafe901730c7db0dcf4f083f316e97269c66f45502055b"}, + {file = "smart_open-7.1.0.tar.gz", hash = "sha256:a4f09f84f0f6d3637c6543aca7b5487438877a21360e7368ccf1f704789752ba"}, ] [package.dependencies] @@ -5283,4 +5258,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.13" -content-hash = "54d5f5d253d47c5c28c874c9aa56eaeba543fa3c27fed6143ae266b0a07ed391" +content-hash = "1adafa89f86e853b424eb1d66d3434520596e6b1e782c975a497a1c857ceabb9" diff --git a/pyproject.toml b/pyproject.toml index 7d27132548..97a9557a78 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,7 +47,6 @@ format-jinja = """ [tool.poetry.dependencies] python = ">=3.10,<3.13" environs = "^11.0.0" -datashaper = "^0.0.49" # Vector Stores azure-search-documents = "^11.5.2" @@ -252,7 +251,6 @@ ignore = [ [tool.ruff.lint.per-file-ignores] "tests/*" = ["S", "D", "ANN", "T201", "ASYNC", "ARG", "PTH", "TRY"] -"examples/*" = ["S", "D", "ANN", "T201", "PTH", "TRY", "PERF"] "graphrag/index/config/*" = ["TCH"] "*.ipynb" = ["T201"] @@ -264,7 +262,7 @@ convention = "numpy" # https://github.com/microsoft/pyright/blob/9f81564a4685ff5c55edd3959f9b39030f590b2f/docs/configuration.md#sample-pyprojecttoml-file [tool.pyright] -include = ["graphrag", "tests", "examples", "examples_notebooks"] +include = ["graphrag", "tests", "examples_notebooks"] exclude = ["**/node_modules", "**/__pycache__"] [tool.pytest.ini_options] diff --git a/tests/smoke/test_fixtures.py b/tests/smoke/test_fixtures.py index af6a5cd96d..247e1a4ff7 100644 --- a/tests/smoke/test_fixtures.py +++ b/tests/smoke/test_fixtures.py @@ -274,7 +274,7 @@ def test_fixture( workflow_config: dict[str, dict[str, Any]], query_config: list[dict[str, str]], ): - if workflow_config.get("skip", False): + if workflow_config.get("skip"): print(f"skipping smoke test {input_path})") return @@ -291,7 +291,7 @@ def test_fixture( if dispose is not None: dispose() - if not workflow_config.get("skip_assert", False): + if not 
workflow_config.get("skip_assert"): print("performing dataset assertions") self.__assert_indexer_outputs(root, workflow_config) diff --git a/tests/unit/config/test_default_config.py b/tests/unit/config/test_default_config.py index ce2282d634..f1507c65b3 100644 --- a/tests/unit/config/test_default_config.py +++ b/tests/unit/config/test_default_config.py @@ -5,55 +5,19 @@ import re import unittest from pathlib import Path -from typing import Any, cast from unittest import mock import pytest import yaml -from pydantic import ValidationError import graphrag.config.defaults as defs from graphrag.config.create_graphrag_config import create_graphrag_config -from graphrag.config.enums import ( - CacheType, - InputFileType, - InputType, - ReportingType, - StorageType, -) from graphrag.config.errors import ( ApiKeyMissingError, AzureApiBaseMissingError, AzureDeploymentNameMissingError, ) -from graphrag.config.input_models.cache_config_input import CacheConfigInput -from graphrag.config.input_models.chunking_config_input import ChunkingConfigInput -from graphrag.config.input_models.claim_extraction_config_input import ( - ClaimExtractionConfigInput, -) -from graphrag.config.input_models.cluster_graph_config_input import ( - ClusterGraphConfigInput, -) -from graphrag.config.input_models.community_reports_config_input import ( - CommunityReportsConfigInput, -) -from graphrag.config.input_models.embed_graph_config_input import EmbedGraphConfigInput -from graphrag.config.input_models.entity_extraction_config_input import ( - EntityExtractionConfigInput, -) -from graphrag.config.input_models.graphrag_config_input import GraphRagConfigInput -from graphrag.config.input_models.input_config_input import InputConfigInput -from graphrag.config.input_models.llm_parameters_input import LLMParametersInput -from graphrag.config.input_models.reporting_config_input import ReportingConfigInput -from graphrag.config.input_models.snapshots_config_input import SnapshotsConfigInput -from 
graphrag.config.input_models.storage_config_input import StorageConfigInput -from graphrag.config.input_models.summarize_descriptions_config_input import ( - SummarizeDescriptionsConfigInput, -) -from graphrag.config.input_models.text_embedding_config_input import ( - TextEmbeddingConfigInput, -) -from graphrag.config.input_models.umap_config_input import UmapConfigInput +from graphrag.config.models.basic_search_config import BasicSearchConfig from graphrag.config.models.cache_config import CacheConfig from graphrag.config.models.chunking_config import ChunkingConfig from graphrag.config.models.claim_extraction_config import ClaimExtractionConfig @@ -82,10 +46,6 @@ PipelineInputConfig, PipelineTextInputConfig, ) -from graphrag.index.config.pipeline import ( - PipelineConfig, - PipelineWorkflowReference, -) from graphrag.index.config.reporting import PipelineFileReportingConfig from graphrag.index.config.storage import PipelineFileStorageConfig from graphrag.index.create_pipeline_config import create_pipeline_config @@ -229,6 +189,7 @@ def test_clear_warnings(self): assert InputConfig is not None assert LLMParameters is not None assert LocalSearchConfig is not None + assert BasicSearchConfig is not None assert ParallelizationParameters is not None assert ReportingConfig is not None assert SnapshotsConfig is not None @@ -236,12 +197,10 @@ def test_clear_warnings(self): assert SummarizeDescriptionsConfig is not None assert TextEmbeddingConfig is not None assert UmapConfig is not None - assert PipelineConfig is not None assert PipelineFileReportingConfig is not None assert PipelineFileStorageConfig is not None assert PipelineInputConfig is not None assert PipelineFileCacheConfig is not None - assert PipelineWorkflowReference is not None @mock.patch.dict(os.environ, {"OPENAI_API_KEY": "test"}, clear=True) def test_string_repr(self): @@ -253,47 +212,31 @@ def test_string_repr(self): # __repr__ can be eval()'d repr_str = config.__repr__() - # TODO: add __repr__ to 
datashaper enum + # TODO: add __repr__ to enum repr_str = repr_str.replace("async_mode=,", "") assert eval(repr_str) is not None - # Pipeline config __str__ can be json loaded - pipeline_config = create_pipeline_config(config) - string_repr = str(pipeline_config) - assert string_repr is not None - assert json.loads(string_repr) is not None - - # Pipeline config __repr__ can be eval()'d - repr_str = pipeline_config.__repr__() - # TODO: add __repr__ to datashaper enum - repr_str = repr_str.replace( - "'async_mode': ,", "" - ) - assert eval(repr_str) is not None - @mock.patch.dict(os.environ, {}, clear=True) def test_default_config_with_no_env_vars_throws(self): with pytest.raises(ApiKeyMissingError): # This should throw an error because the API key is missing - create_pipeline_config(create_graphrag_config()) + create_graphrag_config() @mock.patch.dict(os.environ, {"GRAPHRAG_API_KEY": "test"}, clear=True) def test_default_config_with_api_key_passes(self): # doesn't throw - config = create_pipeline_config(create_graphrag_config()) + config = create_graphrag_config() assert config is not None @mock.patch.dict(os.environ, {"OPENAI_API_KEY": "test"}, clear=True) def test_default_config_with_oai_key_passes_envvar(self): # doesn't throw - config = create_pipeline_config(create_graphrag_config()) + config = create_graphrag_config() assert config is not None def test_default_config_with_oai_key_passes_obj(self): # doesn't throw - config = create_pipeline_config( - create_graphrag_config({"llm": {"api_key": "test"}}) - ) + config = create_graphrag_config({"llm": {"api_key": "test"}}) assert config is not None @mock.patch.dict( @@ -305,13 +248,6 @@ def test_throws_if_azure_is_used_without_api_base_envvar(self): with pytest.raises(AzureApiBaseMissingError): create_graphrag_config() - @mock.patch.dict(os.environ, {"GRAPHRAG_API_KEY": "test"}, clear=True) - def test_throws_if_azure_is_used_without_api_base_obj(self): - with pytest.raises(AzureApiBaseMissingError): - 
create_graphrag_config( - GraphRagConfigInput(llm=LLMParametersInput(type="azure_openai_chat")) - ) - @mock.patch.dict( os.environ, { @@ -325,17 +261,6 @@ def test_throws_if_azure_is_used_without_llm_deployment_name_envvar(self): with pytest.raises(AzureDeploymentNameMissingError): create_graphrag_config() - @mock.patch.dict(os.environ, {"GRAPHRAG_API_KEY": "test"}, clear=True) - def test_throws_if_azure_is_used_without_llm_deployment_name_obj(self): - with pytest.raises(AzureDeploymentNameMissingError): - create_graphrag_config( - GraphRagConfigInput( - llm=LLMParametersInput( - type="azure_openai_chat", api_base="http://some/base" - ) - ) - ) - @mock.patch.dict( os.environ, { @@ -349,20 +274,6 @@ def test_throws_if_azure_is_used_without_embedding_api_base_envvar(self): with pytest.raises(AzureApiBaseMissingError): create_graphrag_config() - @mock.patch.dict(os.environ, {"GRAPHRAG_API_KEY": "test"}, clear=True) - def test_throws_if_azure_is_used_without_embedding_api_base_obj(self): - with pytest.raises(AzureApiBaseMissingError): - create_graphrag_config( - GraphRagConfigInput( - embeddings=TextEmbeddingConfigInput( - llm=LLMParametersInput( - type="azure_openai_embedding", - deployment_name="x", - ) - ), - ) - ) - @mock.patch.dict( os.environ, { @@ -378,43 +289,6 @@ def test_throws_if_azure_is_used_without_embedding_deployment_name_envvar(self): with pytest.raises(AzureDeploymentNameMissingError): create_graphrag_config() - @mock.patch.dict(os.environ, {"GRAPHRAG_API_KEY": "test"}, clear=True) - def test_throws_if_azure_is_used_without_embedding_deployment_name_obj(self): - with pytest.raises(AzureDeploymentNameMissingError): - create_graphrag_config( - GraphRagConfigInput( - llm=LLMParametersInput( - type="azure_openai_chat", - api_base="http://some/base", - deployment_name="model-deployment-name-x", - ), - embeddings=TextEmbeddingConfigInput( - llm=LLMParametersInput( - type="azure_openai_embedding", - ) - ), - ) - ) - - @mock.patch.dict(os.environ, 
{"GRAPHRAG_API_KEY": "test"}, clear=True) - def test_minimim_azure_config_object(self): - config = create_graphrag_config( - GraphRagConfigInput( - llm=LLMParametersInput( - type="azure_openai_chat", - api_base="http://some/base", - deployment_name="model-deployment-name-x", - ), - embeddings=TextEmbeddingConfigInput( - llm=LLMParametersInput( - type="azure_openai_embedding", - deployment_name="model-deployment-name", - ) - ), - ) - ) - assert config is not None - @mock.patch.dict( os.environ, { @@ -462,7 +336,7 @@ def test_throws_if_azure_is_used_without_embedding_deployment_name(self): clear=True, ) def test_csv_input_returns_correct_config(self): - config = create_pipeline_config(create_graphrag_config(root_dir="/some/root")) + config = create_graphrag_config(root_dir="/some/root") assert config.root_dir == "/some/root" # Make sure the input is a CSV input assert isinstance(config.input, PipelineCSVInputConfig) @@ -538,298 +412,11 @@ def find_envvar_names(text) -> set[str]: {"GRAPHRAG_API_KEY": "test"}, clear=True, ) - def test_malformed_input_dict_throws(self): - with pytest.raises(ValidationError): - create_graphrag_config(cast("Any", {"llm": 12})) - @mock.patch.dict( os.environ, ALL_ENV_VARS, clear=True, ) - def test_create_parameters_from_env_vars(self) -> None: - parameters = create_graphrag_config() - assert parameters.async_mode == "asyncio" - assert parameters.cache.storage_account_blob_url == "cache_account_blob_url" - assert parameters.cache.base_dir == "/some/cache/dir" - assert parameters.cache.connection_string == "test_cs1" - assert parameters.cache.container_name == "test_cn1" - assert parameters.cache.type == CacheType.blob - assert parameters.chunks.group_by_columns == ["a", "b"] - assert parameters.chunks.overlap == 12 - assert parameters.chunks.size == 500 - assert parameters.chunks.encoding_model == "encoding-c" - assert parameters.claim_extraction.enabled - assert parameters.claim_extraction.description == "test 123" - assert 
parameters.claim_extraction.max_gleanings == 5000 - assert parameters.claim_extraction.prompt == "tests/unit/config/prompt-a.txt" - assert parameters.claim_extraction.encoding_model == "encoding_a" - assert parameters.cluster_graph.max_cluster_size == 123 - assert parameters.community_reports.max_length == 23456 - assert parameters.community_reports.prompt == "tests/unit/config/prompt-b.txt" - assert parameters.embed_graph.enabled - assert parameters.embed_graph.iterations == 878787 - assert parameters.embed_graph.num_walks == 5_000_000 - assert parameters.embed_graph.random_seed == 10101 - assert parameters.embed_graph.walk_length == 555111 - assert parameters.embed_graph.window_size == 12345 - assert parameters.embeddings.batch_max_tokens == 17 - assert parameters.embeddings.batch_size == 1_000_000 - assert parameters.embeddings.llm.concurrent_requests == 12 - assert parameters.embeddings.llm.deployment_name == "model-deployment-name" - assert parameters.embeddings.llm.max_retries == 3 - assert parameters.embeddings.llm.max_retry_wait == 0.1123 - assert parameters.embeddings.llm.model == "text-embedding-2" - assert parameters.embeddings.llm.requests_per_minute == 500 - assert parameters.embeddings.llm.sleep_on_rate_limit_recommendation is False - assert parameters.embeddings.llm.tokens_per_minute == 7000 - assert parameters.embeddings.llm.type == "azure_openai_embedding" - assert parameters.embeddings.parallelization.num_threads == 2345 - assert parameters.embeddings.parallelization.stagger == 0.456 - assert parameters.embeddings.skip == ["a1", "b1", "c1"] - assert parameters.embeddings.target == "all" - assert parameters.encoding_model == "test123" - assert parameters.entity_extraction.entity_types == ["cat", "dog", "elephant"] - assert parameters.entity_extraction.llm.api_base == "http://some/base" - assert parameters.entity_extraction.max_gleanings == 112 - assert parameters.entity_extraction.prompt == "tests/unit/config/prompt-c.txt" - assert 
parameters.entity_extraction.encoding_model == "encoding_b" - assert parameters.input.storage_account_blob_url == "input_account_blob_url" - assert parameters.input.base_dir == "/some/input/dir" - assert parameters.input.connection_string == "input_cs" - assert parameters.input.container_name == "input_cn" - assert parameters.input.document_attribute_columns == ["test1", "test2"] - assert parameters.input.encoding == "utf-16" - assert parameters.input.file_pattern == ".*\\test\\.txt$" - assert parameters.input.file_type == InputFileType.text - assert parameters.input.source_column == "test_source" - assert parameters.input.text_column == "test_text" - assert parameters.input.timestamp_column == "test_timestamp" - assert parameters.input.timestamp_format == "test_format" - assert parameters.input.title_column == "test_title" - assert parameters.input.type == InputType.blob - assert parameters.llm.api_base == "http://some/base" - assert parameters.llm.api_key == "test" - assert parameters.llm.api_version == "v1234" - assert parameters.llm.concurrent_requests == 12 - assert parameters.llm.deployment_name == "model-deployment-name-x" - assert parameters.llm.max_retries == 312 - assert parameters.llm.max_retry_wait == 0.1122 - assert parameters.llm.max_tokens == 15000 - assert parameters.llm.model == "test-llm" - assert parameters.llm.model_supports_json - assert parameters.llm.n == 1 - assert parameters.llm.organization == "test_org" - assert parameters.llm.proxy == "http://some/proxy" - assert parameters.llm.request_timeout == 12.7 - assert parameters.llm.requests_per_minute == 900 - assert parameters.llm.sleep_on_rate_limit_recommendation is False - assert parameters.llm.temperature == 0.0 - assert parameters.llm.top_p == 1.0 - assert parameters.llm.tokens_per_minute == 8000 - assert parameters.llm.type == "azure_openai_chat" - assert parameters.parallelization.num_threads == 987 - assert parameters.parallelization.stagger == 0.123 - assert ( - 
parameters.reporting.storage_account_blob_url - == "reporting_account_blob_url" - ) - assert parameters.reporting.base_dir == "/some/reporting/dir" - assert parameters.reporting.connection_string == "test_cs2" - assert parameters.reporting.container_name == "test_cn2" - assert parameters.reporting.type == ReportingType.blob - assert parameters.skip_workflows == ["a", "b", "c"] - assert parameters.snapshots.graphml - assert parameters.snapshots.embeddings - assert parameters.snapshots.transient - assert parameters.storage.storage_account_blob_url == "storage_account_blob_url" - assert parameters.storage.base_dir == "/some/storage/dir" - assert parameters.storage.connection_string == "test_cs" - assert parameters.storage.container_name == "test_cn" - assert parameters.storage.type == StorageType.blob - assert parameters.summarize_descriptions.max_length == 12345 - assert ( - parameters.summarize_descriptions.prompt == "tests/unit/config/prompt-d.txt" - ) - assert parameters.umap.enabled - assert parameters.local_search.text_unit_prop == 0.713 - assert parameters.local_search.community_prop == 0.1234 - assert parameters.local_search.llm_max_tokens == 12 - assert parameters.local_search.top_k_relationships == 15 - assert parameters.local_search.conversation_history_max_turns == 2 - assert parameters.local_search.top_k_entities == 14 - assert parameters.local_search.temperature == 0.1 - assert parameters.local_search.top_p == 0.9 - assert parameters.local_search.n == 2 - assert parameters.local_search.max_tokens == 142435 - - assert parameters.global_search.temperature == 0.1 - assert parameters.global_search.top_p == 0.9 - assert parameters.global_search.n == 2 - assert parameters.global_search.max_tokens == 5123 - assert parameters.global_search.data_max_tokens == 123 - assert parameters.global_search.map_max_tokens == 4123 - assert parameters.global_search.concurrency == 7 - assert parameters.global_search.reduce_max_tokens == 15432 - - @mock.patch.dict(os.environ, 
{"API_KEY_X": "test"}, clear=True) - def test_create_parameters(self) -> None: - parameters = create_graphrag_config( - GraphRagConfigInput( - llm=LLMParametersInput(api_key="${API_KEY_X}", model="test-llm"), - storage=StorageConfigInput( - type=StorageType.blob, - connection_string="test_cs", - container_name="test_cn", - base_dir="/some/storage/dir", - storage_account_blob_url="storage_account_blob_url", - ), - cache=CacheConfigInput( - type=CacheType.blob, - connection_string="test_cs1", - container_name="test_cn1", - base_dir="/some/cache/dir", - storage_account_blob_url="cache_account_blob_url", - ), - reporting=ReportingConfigInput( - type=ReportingType.blob, - connection_string="test_cs2", - container_name="test_cn2", - base_dir="/some/reporting/dir", - storage_account_blob_url="reporting_account_blob_url", - ), - input=InputConfigInput( - file_type=InputFileType.text, - file_encoding="utf-16", - document_attribute_columns=["test1", "test2"], - base_dir="/some/input/dir", - connection_string="input_cs", - container_name="input_cn", - file_pattern=".*\\test\\.txt$", - source_column="test_source", - text_column="test_text", - timestamp_column="test_timestamp", - timestamp_format="test_format", - title_column="test_title", - type="blob", - storage_account_blob_url="input_account_blob_url", - ), - embed_graph=EmbedGraphConfigInput( - enabled=True, - num_walks=5_000_000, - iterations=878787, - random_seed=10101, - walk_length=555111, - ), - embeddings=TextEmbeddingConfigInput( - batch_size=1_000_000, - batch_max_tokens=8000, - skip=["a1", "b1", "c1"], - llm=LLMParametersInput(model="text-embedding-2"), - ), - chunks=ChunkingConfigInput( - size=500, overlap=12, group_by_columns=["a", "b"] - ), - snapshots=SnapshotsConfigInput( - graphml=True, - embeddings=True, - transient=True, - ), - entity_extraction=EntityExtractionConfigInput( - max_gleanings=112, - entity_types=["cat", "dog", "elephant"], - prompt="entity_extraction_prompt_file.txt", - ), - 
summarize_descriptions=SummarizeDescriptionsConfigInput( - max_length=12345, prompt="summarize_prompt_file.txt" - ), - community_reports=CommunityReportsConfigInput( - max_length=23456, - prompt="community_report_prompt_file.txt", - max_input_length=12345, - ), - claim_extraction=ClaimExtractionConfigInput( - description="test 123", - max_gleanings=5000, - prompt="claim_extraction_prompt_file.txt", - ), - cluster_graph=ClusterGraphConfigInput( - max_cluster_size=123, - ), - umap=UmapConfigInput(enabled=True), - encoding_model="test123", - skip_workflows=["a", "b", "c"], - ), - ".", - ) - - assert parameters.cache.base_dir == "/some/cache/dir" - assert parameters.cache.connection_string == "test_cs1" - assert parameters.cache.container_name == "test_cn1" - assert parameters.cache.type == CacheType.blob - assert parameters.cache.storage_account_blob_url == "cache_account_blob_url" - assert parameters.chunks.group_by_columns == ["a", "b"] - assert parameters.chunks.overlap == 12 - assert parameters.chunks.size == 500 - assert parameters.claim_extraction.description == "test 123" - assert parameters.claim_extraction.max_gleanings == 5000 - assert parameters.claim_extraction.prompt == "claim_extraction_prompt_file.txt" - assert parameters.cluster_graph.max_cluster_size == 123 - assert parameters.community_reports.max_input_length == 12345 - assert parameters.community_reports.max_length == 23456 - assert parameters.community_reports.prompt == "community_report_prompt_file.txt" - assert parameters.embed_graph.enabled - assert parameters.embed_graph.iterations == 878787 - assert parameters.embed_graph.num_walks == 5_000_000 - assert parameters.embed_graph.random_seed == 10101 - assert parameters.embed_graph.walk_length == 555111 - assert parameters.embeddings.batch_max_tokens == 8000 - assert parameters.embeddings.batch_size == 1_000_000 - assert parameters.embeddings.llm.model == "text-embedding-2" - assert parameters.embeddings.skip == ["a1", "b1", "c1"] - assert 
parameters.encoding_model == "test123" - assert parameters.entity_extraction.entity_types == ["cat", "dog", "elephant"] - assert parameters.entity_extraction.max_gleanings == 112 - assert ( - parameters.entity_extraction.prompt == "entity_extraction_prompt_file.txt" - ) - assert parameters.input.base_dir == "/some/input/dir" - assert parameters.input.connection_string == "input_cs" - assert parameters.input.container_name == "input_cn" - assert parameters.input.document_attribute_columns == ["test1", "test2"] - assert parameters.input.encoding == "utf-16" - assert parameters.input.file_pattern == ".*\\test\\.txt$" - assert parameters.input.source_column == "test_source" - assert parameters.input.type == "blob" - assert parameters.input.text_column == "test_text" - assert parameters.input.timestamp_column == "test_timestamp" - assert parameters.input.timestamp_format == "test_format" - assert parameters.input.title_column == "test_title" - assert parameters.input.file_type == InputFileType.text - assert parameters.input.storage_account_blob_url == "input_account_blob_url" - assert parameters.llm.api_key == "test" - assert parameters.llm.model == "test-llm" - assert parameters.reporting.base_dir == "/some/reporting/dir" - assert parameters.reporting.connection_string == "test_cs2" - assert parameters.reporting.container_name == "test_cn2" - assert parameters.reporting.type == ReportingType.blob - assert ( - parameters.reporting.storage_account_blob_url - == "reporting_account_blob_url" - ) - assert parameters.skip_workflows == ["a", "b", "c"] - assert parameters.snapshots.graphml - assert parameters.snapshots.embeddings - assert parameters.snapshots.transient - assert parameters.storage.base_dir == "/some/storage/dir" - assert parameters.storage.connection_string == "test_cs" - assert parameters.storage.container_name == "test_cn" - assert parameters.storage.type == StorageType.blob - assert parameters.storage.storage_account_blob_url == "storage_account_blob_url" - 
assert parameters.summarize_descriptions.max_length == 12345 - assert parameters.summarize_descriptions.prompt == "summarize_prompt_file.txt" - assert parameters.umap.enabled - @mock.patch.dict( os.environ, {"GRAPHRAG_API_KEY": "test"}, diff --git a/tests/unit/indexing/config/__init__.py b/tests/unit/indexing/config/__init__.py deleted file mode 100644 index 0a3e38adfb..0000000000 --- a/tests/unit/indexing/config/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License diff --git a/tests/unit/indexing/config/default_config_with_everything_overridden.yml b/tests/unit/indexing/config/default_config_with_everything_overridden.yml deleted file mode 100644 index 7a2f712e46..0000000000 --- a/tests/unit/indexing/config/default_config_with_everything_overridden.yml +++ /dev/null @@ -1,20 +0,0 @@ -extends: default - -input: - file_type: text - base_dir: /some/overridden/dir - file_pattern: test.txt - -storage: - type: file - -cache: - type: file - -reporting: - type: file - -workflows: - - name: TEST_WORKFLOW - steps: - - verb: TEST_VERB diff --git a/tests/unit/indexing/config/default_config_with_overridden_input.yml b/tests/unit/indexing/config/default_config_with_overridden_input.yml deleted file mode 100644 index 68631a315a..0000000000 --- a/tests/unit/indexing/config/default_config_with_overridden_input.yml +++ /dev/null @@ -1,5 +0,0 @@ -extends: default -input: - file_type: text - base_dir: /some/overridden/dir - file_pattern: test.txt diff --git a/tests/unit/indexing/config/default_config_with_overridden_workflows.yml b/tests/unit/indexing/config/default_config_with_overridden_workflows.yml deleted file mode 100644 index c3c9d07c2c..0000000000 --- a/tests/unit/indexing/config/default_config_with_overridden_workflows.yml +++ /dev/null @@ -1,6 +0,0 @@ -extends: default - -workflows: - - name: TEST_WORKFLOW - steps: - - verb: TEST_VERB diff --git a/tests/unit/indexing/config/helpers.py 
b/tests/unit/indexing/config/helpers.py deleted file mode 100644 index f70b9af81e..0000000000 --- a/tests/unit/indexing/config/helpers.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License -import json -import unittest -from typing import Any - -from graphrag.config.create_graphrag_config import create_graphrag_config -from graphrag.index.create_pipeline_config import PipelineConfig, create_pipeline_config - - -def assert_contains_default_config( - test_case: unittest.TestCase, - config: Any, - check_input=True, - check_storage=True, - check_reporting=True, - check_cache=True, - check_workflows=True, -): - """Asserts that the config contains the default config.""" - assert config is not None - assert isinstance(config, PipelineConfig) - - checked_config = json.loads( - config.model_dump_json(exclude_defaults=True, exclude_unset=True) - ) - - actual_default_config = json.loads( - create_pipeline_config(create_graphrag_config()).model_dump_json( - exclude_defaults=True, exclude_unset=True - ) - ) - props_to_ignore = ["root_dir", "extends"] - - # Make sure there is some sort of workflows - if not check_workflows: - props_to_ignore.append("workflows") - - # Make sure it tries to load some sort of input - if not check_input: - props_to_ignore.append("input") - - # Make sure it tries to load some sort of storage - if not check_storage: - props_to_ignore.append("storage") - - # Make sure it tries to load some sort of reporting - if not check_reporting: - props_to_ignore.append("reporting") - - # Make sure it tries to load some sort of cache - if not check_cache: - props_to_ignore.append("cache") - - for prop in props_to_ignore: - checked_config.pop(prop, None) - actual_default_config.pop(prop, None) - - assert actual_default_config == actual_default_config | checked_config diff --git a/tests/unit/indexing/config/test_load.py b/tests/unit/indexing/config/test_load.py deleted file mode 100644 index 
c458081ced..0000000000 --- a/tests/unit/indexing/config/test_load.py +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT Licenses -import json -import os -import unittest -from pathlib import Path -from typing import Any -from unittest import mock - -from graphrag.config.create_graphrag_config import create_graphrag_config -from graphrag.index.config.pipeline import PipelineConfig -from graphrag.index.create_pipeline_config import create_pipeline_config -from graphrag.index.load_pipeline_config import load_pipeline_config - -current_dir = os.path.dirname(__file__) - - -class TestLoadPipelineConfig(unittest.TestCase): - @mock.patch.dict(os.environ, {"GRAPHRAG_API_KEY": "test"}, clear=True) - def test_config_passed_in_returns_config(self): - config = PipelineConfig() - result = load_pipeline_config(config) - assert result == config - - @mock.patch.dict(os.environ, {"GRAPHRAG_API_KEY": "test"}, clear=True) - def test_loading_default_config_returns_config(self): - result = load_pipeline_config("default") - self.assert_is_default_config(result) - - @mock.patch.dict(os.environ, {"GRAPHRAG_API_KEY": "test"}, clear=True) - def test_loading_default_config_with_input_overridden(self): - config = load_pipeline_config( - str(Path(current_dir) / "default_config_with_overridden_input.yml") - ) - - # Check that the config is merged - # but skip checking the input - self.assert_is_default_config( - config, check_input=False, ignore_workflows=["create_base_text_units"] - ) - - if config.input is None: - msg = "Input should not be none" - raise Exception(msg) - - # Check that the input is merged - assert config.input.file_pattern == "test.txt" - assert config.input.file_type == "text" - assert config.input.base_dir == "/some/overridden/dir" - - @mock.patch.dict(os.environ, {"GRAPHRAG_API_KEY": "test"}, clear=True) - def test_loading_default_config_with_workflows_overridden(self): - config = load_pipeline_config( - 
str(Path(current_dir) / "default_config_with_overridden_workflows.yml") - ) - - # Check that the config is merged - # but skip checking the input - self.assert_is_default_config(config, check_workflows=False) - - # Make sure the workflows are overridden - assert len(config.workflows) == 1 - assert config.workflows[0].name == "TEST_WORKFLOW" - assert config.workflows[0].steps is not None - assert len(config.workflows[0].steps) == 1 # type: ignore - assert config.workflows[0].steps[0]["verb"] == "TEST_VERB" # type: ignore - - @mock.patch.dict(os.environ, {"GRAPHRAG_API_KEY": "test"}, clear=True) - def assert_is_default_config( - self, - config: Any, - check_input=True, - check_storage=True, - check_reporting=True, - check_cache=True, - check_workflows=True, - ignore_workflows=None, - ): - if ignore_workflows is None: - ignore_workflows = [] - assert config is not None - assert isinstance(config, PipelineConfig) - - checked_config = json.loads( - config.model_dump_json(exclude_defaults=True, exclude_unset=True) - ) - - actual_default_config = json.loads( - create_pipeline_config( - create_graphrag_config(root_dir=".") - ).model_dump_json(exclude_defaults=True, exclude_unset=True) - ) - props_to_ignore = ["root_dir", "extends"] - - # Make sure there is some sort of workflows - if not check_workflows: - props_to_ignore.append("workflows") - - # Make sure it tries to load some sort of input - if not check_input: - props_to_ignore.append("input") - - # Make sure it tries to load some sort of storage - if not check_storage: - props_to_ignore.append("storage") - - # Make sure it tries to load some sort of reporting - if not check_reporting: - props_to_ignore.append("reporting") - - # Make sure it tries to load some sort of cache - if not check_cache: - props_to_ignore.append("cache") - - for prop in props_to_ignore: - checked_config.pop(prop, None) - actual_default_config.pop(prop, None) - - for prop in actual_default_config: - if prop == "workflows": - assert 
len(checked_config[prop]) == len(actual_default_config[prop]) - for i, workflow in enumerate(actual_default_config[prop]): - if workflow["name"] not in ignore_workflows: - assert workflow == actual_default_config[prop][i] - else: - assert checked_config[prop] == actual_default_config[prop] - - def setUp(self) -> None: - os.environ["GRAPHRAG_OPENAI_API_KEY"] = "test" - os.environ["GRAPHRAG_OPENAI_EMBEDDING_API_KEY"] = "test" - return super().setUp() diff --git a/tests/unit/indexing/test_exports.py b/tests/unit/indexing/test_exports.py deleted file mode 100644 index ee2b23e622..0000000000 --- a/tests/unit/indexing/test_exports.py +++ /dev/null @@ -1,10 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License -from graphrag.index.create_pipeline_config import create_pipeline_config -from graphrag.index.run import run_pipeline, run_pipeline_with_config - - -def test_exported_functions(): - assert callable(create_pipeline_config) - assert callable(run_pipeline_with_config) - assert callable(run_pipeline) diff --git a/tests/unit/indexing/workflows/__init__.py b/tests/unit/indexing/workflows/__init__.py deleted file mode 100644 index 0a3e38adfb..0000000000 --- a/tests/unit/indexing/workflows/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License diff --git a/tests/unit/indexing/workflows/helpers.py b/tests/unit/indexing/workflows/helpers.py deleted file mode 100644 index 512e8294c2..0000000000 --- a/tests/unit/indexing/workflows/helpers.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
-# Licensed under the MIT License -mock_verbs = { - "mock_verb": lambda x: x, - "mock_verb_2": lambda x: x, -} - -mock_workflows = { - "mock_workflow": lambda _x: [ - { - "verb": "mock_verb", - "args": { - "column": "test", - }, - } - ], - "mock_workflow_2": lambda _x: [ - { - "verb": "mock_verb", - "args": { - "column": "test", - }, - }, - { - "verb": "mock_verb_2", - "args": { - "column": "test", - }, - }, - ], -} diff --git a/tests/unit/indexing/workflows/test_export.py b/tests/unit/indexing/workflows/test_export.py deleted file mode 100644 index 206b4869e6..0000000000 --- a/tests/unit/indexing/workflows/test_export.py +++ /dev/null @@ -1,125 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -from typing import Any, cast - -import pandas as pd -from datashaper import ( - Table, - VerbInput, - VerbResult, - create_verb_result, -) - -from graphrag.index.config.pipeline import PipelineWorkflowReference -from graphrag.index.run import run_pipeline -from graphrag.storage.memory_pipeline_storage import MemoryPipelineStorage -from graphrag.storage.pipeline_storage import PipelineStorage - - -async def mock_verb( - input: VerbInput, storage: PipelineStorage, **_kwargs -) -> VerbResult: - source = cast("pd.DataFrame", input.get_input()) - - output = source[["id"]] - - await storage.set("mock_write", source[["id"]]) - - return create_verb_result( - cast( - "Table", - output, - ) - ) - - -async def mock_no_return_verb( - input: VerbInput, storage: PipelineStorage, **_kwargs -) -> VerbResult: - source = cast("pd.DataFrame", input.get_input()) - - # write some outputs to storage independent of the return - await storage.set("empty_write", source[["name"]]) - - return create_verb_result( - cast( - "Table", - pd.DataFrame(), - ) - ) - - -async def test_normal_result_exports_parquet(): - mock_verbs: Any = {"mock_verb": mock_verb} - mock_workflows: Any = { - "mock_workflow": lambda _x: [ - { - "verb": "mock_verb", - "args": { - "column": 
"test", - }, - } - ] - } - workflows = [ - PipelineWorkflowReference( - name="mock_workflow", - config=None, - ) - ] - dataset = pd.DataFrame({"id": [1, 2, 3], "name": ["a", "b", "c"]}) - storage = MemoryPipelineStorage() - pipeline_result = [ - gen - async for gen in run_pipeline( - workflows, - dataset, - storage=storage, - additional_workflows=mock_workflows, - additional_verbs=mock_verbs, - ) - ] - - assert len(pipeline_result) == 1 - assert storage.keys() == ["stats.json", "mock_write", "mock_workflow.parquet"], ( - "Mock workflow output should be written to storage by the exporter when there is a non-empty data frame" - ) - - -async def test_empty_result_does_not_export_parquet(): - mock_verbs: Any = {"mock_no_return_verb": mock_no_return_verb} - mock_workflows: Any = { - "mock_workflow": lambda _x: [ - { - "verb": "mock_no_return_verb", - "args": { - "column": "test", - }, - } - ] - } - workflows = [ - PipelineWorkflowReference( - name="mock_workflow", - config=None, - ) - ] - dataset = pd.DataFrame({"id": [1, 2, 3], "name": ["a", "b", "c"]}) - storage = MemoryPipelineStorage() - pipeline_result = [ - gen - async for gen in run_pipeline( - workflows, - dataset, - storage=storage, - additional_workflows=mock_workflows, - additional_verbs=mock_verbs, - ) - ] - - assert len(pipeline_result) == 1 - assert storage.keys() == [ - "stats.json", - "empty_write", - ], "Mock workflow output should not be written to storage by the exporter" diff --git a/tests/unit/indexing/workflows/test_load.py b/tests/unit/indexing/workflows/test_load.py deleted file mode 100644 index 60ae6647b4..0000000000 --- a/tests/unit/indexing/workflows/test_load.py +++ /dev/null @@ -1,237 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
-# Licensed under the MIT License -import unittest - -import pytest - -from graphrag.index.config.pipeline import PipelineWorkflowReference -from graphrag.index.errors import UnknownWorkflowError -from graphrag.index.workflows.load import create_workflow, load_workflows - -from .helpers import mock_verbs, mock_workflows - - -class TestCreateWorkflow(unittest.TestCase): - def test_workflow_with_steps_should_not_fail(self): - create_workflow( - "workflow_with_steps", - [ - { - "verb": "mock_verb", - "args": { - "column": "test", - }, - } - ], - config=None, - additional_verbs=mock_verbs, - ) - - def test_non_existent_workflow_without_steps_should_crash(self): - # since we don't have a workflow named "test", and the user didn't provide any steps, we should crash - # since we don't know what to do - with pytest.raises(UnknownWorkflowError): - create_workflow("test", None, config=None, additional_verbs=mock_verbs) - - def test_existing_workflow_should_not_crash(self): - create_workflow( - "mock_workflow", - None, - config=None, - additional_verbs=mock_verbs, - additional_workflows=mock_workflows, - ) - - -class TestLoadWorkflows(unittest.TestCase): - def test_non_existent_workflow_should_crash(self): - with pytest.raises(UnknownWorkflowError): - load_workflows( - [ - PipelineWorkflowReference( - name="some_workflow_that_does_not_exist", - config=None, - ) - ], - additional_workflows=mock_workflows, - additional_verbs=mock_verbs, - ) - - def test_single_workflow_should_not_crash(self): - load_workflows( - [ - PipelineWorkflowReference( - name="mock_workflow", - config=None, - ) - ], - additional_workflows=mock_workflows, - additional_verbs=mock_verbs, - ) - - def test_multiple_workflows_should_not_crash(self): - load_workflows( - [ - PipelineWorkflowReference( - name="mock_workflow", - config=None, - ), - PipelineWorkflowReference( - name="mock_workflow_2", - config=None, - ), - ], - # the two above are in the "mock_workflows" list - additional_workflows=mock_workflows, 
- additional_verbs=mock_verbs, - ) - - def test_two_interdependent_workflows_should_provide_correct_order(self): - ordered_workflows, _deps = load_workflows( - [ - PipelineWorkflowReference( - name="interdependent_workflow_1", - steps=[ - { - "verb": "mock_verb", - "args": { - "column": "test", - }, - "input": { - "source": "workflow:interdependent_workflow_2" - }, # This one is dependent on the second one, so when it comes out of load_workflows, it should be first - } - ], - ), - PipelineWorkflowReference( - name="interdependent_workflow_2", - steps=[ - { - "verb": "mock_verb", - "args": { - "column": "test", - }, - } - ], - ), - ], - # the two above are in the "mock_workflows" list - additional_workflows=mock_workflows, - additional_verbs=mock_verbs, - ) - - # two should only come out - assert len(ordered_workflows) == 2 - assert ordered_workflows[0].workflow.name == "interdependent_workflow_2" - assert ordered_workflows[1].workflow.name == "interdependent_workflow_1" - - def test_three_interdependent_workflows_should_provide_correct_order(self): - ordered_workflows, _deps = load_workflows( - [ - PipelineWorkflowReference( - name="interdependent_workflow_3", - steps=[ - { - "verb": "mock_verb", - "args": { - "column": "test", - }, - } - ], - ), - PipelineWorkflowReference( - name="interdependent_workflow_1", - steps=[ - { - "verb": "mock_verb", - "args": { - "column": "test", - }, - "input": {"source": "workflow:interdependent_workflow_2"}, - } - ], - ), - PipelineWorkflowReference( - name="interdependent_workflow_2", - steps=[ - { - "verb": "mock_verb", - "args": { - "column": "test", - }, - "input": {"source": "workflow:interdependent_workflow_3"}, - } - ], - ), - ], - # the two above are in the "mock_workflows" list - additional_workflows=mock_workflows, - additional_verbs=mock_verbs, - ) - - order = [ - "interdependent_workflow_3", - "interdependent_workflow_2", - "interdependent_workflow_1", - ] - assert [x.workflow.name for x in ordered_workflows] == order 
- - def test_two_workflows_dependent_on_another_single_workflow_should_provide_correct_order( - self, - ): - ordered_workflows, _deps = load_workflows( - [ - # Workflows 1 and 2 are dependent on 3, so 3 should come out first - PipelineWorkflowReference( - name="interdependent_workflow_3", - steps=[ - { - "verb": "mock_verb", - "args": { - "column": "test", - }, - } - ], - ), - PipelineWorkflowReference( - name="interdependent_workflow_1", - steps=[ - { - "verb": "mock_verb", - "args": { - "column": "test", - }, - "input": {"source": "workflow:interdependent_workflow_3"}, - } - ], - ), - PipelineWorkflowReference( - name="interdependent_workflow_2", - steps=[ - { - "verb": "mock_verb", - "args": { - "column": "test", - }, - "input": {"source": "workflow:interdependent_workflow_3"}, - } - ], - ), - ], - # the two above are in the "mock_workflows" list - additional_workflows=mock_workflows, - additional_verbs=mock_verbs, - ) - - assert len(ordered_workflows) == 3 - assert ordered_workflows[0].workflow.name == "interdependent_workflow_3" - - # The order of the other two doesn't matter, but they need to be there - assert ordered_workflows[1].workflow.name in [ - "interdependent_workflow_1", - "interdependent_workflow_2", - ] - assert ordered_workflows[2].workflow.name in [ - "interdependent_workflow_1", - "interdependent_workflow_2", - ] diff --git a/tests/verbs/test_compute_communities.py b/tests/verbs/test_compute_communities.py index 1b23ef97b9..a460793e0b 100644 --- a/tests/verbs/test_compute_communities.py +++ b/tests/verbs/test_compute_communities.py @@ -1,34 +1,35 @@ # Copyright (c) 2024 Microsoft Corporation. 
# Licensed under the MIT License -from graphrag.index.flows.compute_communities import ( - compute_communities, -) -from graphrag.index.workflows.v1.compute_communities import ( - workflow_name, -) +from graphrag.callbacks.noop_verb_callbacks import NoopVerbCallbacks +from graphrag.config.create_graphrag_config import create_graphrag_config +from graphrag.index.workflows.compute_communities import run_workflow +from graphrag.utils.storage import load_table_from_storage from .util import ( compare_outputs, - get_config_for_workflow, + create_test_context, load_test_table, ) -def test_compute_communities(): - edges = load_test_table("base_relationship_edges") +async def test_compute_communities(): expected = load_test_table("base_communities") - config = get_config_for_workflow(workflow_name) - cluster_config = config["cluster_graph"] + context = await create_test_context( + storage=["base_relationship_edges"], + ) - actual = compute_communities( - edges, - cluster_config.max_cluster_size, - cluster_config.use_lcc, - cluster_config.seed, + config = create_graphrag_config() + + await run_workflow( + config, + context, + NoopVerbCallbacks(), ) + actual = await load_table_from_storage("base_communities", context.storage) + columns = list(expected.columns.values) compare_outputs(actual, expected, columns) assert len(actual.columns) == len(expected.columns) diff --git a/tests/verbs/test_create_base_text_units.py b/tests/verbs/test_create_base_text_units.py index cf1d267aa3..587db6549d 100644 --- a/tests/verbs/test_create_base_text_units.py +++ b/tests/verbs/test_create_base_text_units.py @@ -1,65 +1,33 @@ # Copyright (c) 2024 Microsoft Corporation. 
# Licensed under the MIT License -from graphrag.index.run.utils import create_run_context -from graphrag.index.workflows.v1.create_base_text_units import ( - build_steps, - workflow_name, -) +from graphrag.callbacks.noop_verb_callbacks import NoopVerbCallbacks +from graphrag.config.create_graphrag_config import create_graphrag_config +from graphrag.index.workflows.create_base_text_units import run_workflow, workflow_name +from graphrag.utils.storage import load_table_from_storage from .util import ( compare_outputs, - get_config_for_workflow, - get_workflow_output, - load_input_tables, + create_test_context, load_test_table, ) async def test_create_base_text_units(): - input_tables = load_input_tables(inputs=[]) expected = load_test_table(workflow_name) - context = create_run_context(None, None, None) + context = await create_test_context() - config = get_config_for_workflow(workflow_name) + config = create_graphrag_config() # test data was created with 4o, so we need to match the encoding for chunks to be identical - config["chunks"].encoding_model = "o200k_base" - - steps = build_steps(config) + config.chunks.encoding_model = "o200k_base" - await get_workflow_output( - input_tables, - { - "steps": steps, - }, + await run_workflow( + config, context, + NoopVerbCallbacks(), ) - actual = await context.runtime_storage.get("base_text_units") - compare_outputs(actual, expected) - - -async def test_create_base_text_units_with_snapshot(): - input_tables = load_input_tables(inputs=[]) - - context = create_run_context(None, None, None) - - config = get_config_for_workflow(workflow_name) - # test data was created with 4o, so we need to match the encoding for chunks to be identical - config["chunks"].encoding_model = "o200k_base" - config["snapshot_transient"] = True - - steps = build_steps(config) + actual = await load_table_from_storage(workflow_name, context.storage) - await get_workflow_output( - input_tables, - { - "steps": steps, - }, - context, - ) - - assert 
context.storage.keys() == ["create_base_text_units.parquet"], ( - "Text unit snapshot keys differ" - ) + compare_outputs(actual, expected) diff --git a/tests/verbs/test_create_final_communities.py b/tests/verbs/test_create_final_communities.py index b9f16f4c2b..07c9e9baa5 100644 --- a/tests/verbs/test_create_final_communities.py +++ b/tests/verbs/test_create_final_communities.py @@ -1,32 +1,42 @@ # Copyright (c) 2024 Microsoft Corporation. # Licensed under the MIT License -from graphrag.index.flows.create_final_communities import ( - create_final_communities, -) -from graphrag.index.workflows.v1.create_final_communities import ( +from graphrag.callbacks.noop_verb_callbacks import NoopVerbCallbacks +from graphrag.config.create_graphrag_config import create_graphrag_config +from graphrag.index.workflows.create_final_communities import ( + run_workflow, workflow_name, ) +from graphrag.utils.storage import load_table_from_storage from .util import ( compare_outputs, + create_test_context, load_test_table, ) -def test_create_final_communities(): - base_entity_nodes = load_test_table("base_entity_nodes") - base_relationship_edges = load_test_table("base_relationship_edges") - base_communities = load_test_table("base_communities") - +async def test_create_final_communities(): expected = load_test_table(workflow_name) - actual = create_final_communities( - base_entity_nodes=base_entity_nodes, - base_relationship_edges=base_relationship_edges, - base_communities=base_communities, + context = await create_test_context( + storage=[ + "base_entity_nodes", + "base_relationship_edges", + "base_communities", + ], + ) + + config = create_graphrag_config() + + await run_workflow( + config, + context, + NoopVerbCallbacks(), ) + actual = await load_table_from_storage(workflow_name, context.storage) + assert "period" in expected.columns assert "id" in expected.columns columns = list(expected.columns.values) diff --git a/tests/verbs/test_create_final_community_reports.py 
b/tests/verbs/test_create_final_community_reports.py index 85a6c3ee2b..896fe6e3cb 100644 --- a/tests/verbs/test_create_final_community_reports.py +++ b/tests/verbs/test_create_final_community_reports.py @@ -3,23 +3,24 @@ import pytest -from datashaper.errors import VerbParallelizationError +from graphrag.callbacks.noop_verb_callbacks import NoopVerbCallbacks +from graphrag.config.create_graphrag_config import create_graphrag_config from graphrag.config.enums import LLMType from graphrag.index.operations.summarize_communities.community_reports_extractor.community_reports_extractor import ( CommunityReportResponse, FindingModel, ) -from graphrag.index.workflows.v1.create_final_community_reports import ( - build_steps, +from graphrag.index.run.derive_from_rows import ParallelizationError +from graphrag.index.workflows.create_final_community_reports import ( + run_workflow, workflow_name, ) +from graphrag.utils.storage import load_table_from_storage from .util import ( compare_outputs, - get_config_for_workflow, - get_workflow_output, - load_input_tables, + create_test_context, load_test_table, ) @@ -48,28 +49,32 @@ async def test_create_final_community_reports(): - input_tables = load_input_tables([ - "workflow:create_final_nodes", - "workflow:create_final_covariates", - "workflow:create_final_relationships", - "workflow:create_final_entities", - "workflow:create_final_communities", - ]) expected = load_test_table(workflow_name) - config = get_config_for_workflow(workflow_name) - - config["create_community_reports"]["strategy"]["llm"] = MOCK_LLM_CONFIG + context = await create_test_context( + storage=[ + "create_final_nodes", + "create_final_covariates", + "create_final_relationships", + "create_final_entities", + "create_final_communities", + ] + ) - steps = build_steps(config) + config = create_graphrag_config() + config.community_reports.strategy = { + "type": "graph_intelligence", + "llm": MOCK_LLM_CONFIG, + } - actual = await get_workflow_output( - input_tables, 
- { - "steps": steps, - }, + await run_workflow( + config, + context, + NoopVerbCallbacks(), ) + actual = await load_table_from_storage(workflow_name, context.storage) + assert len(actual.columns) == len(expected.columns) # only assert a couple of columns that are not mock - most of this table is LLM-generated @@ -81,25 +86,24 @@ async def test_create_final_community_reports(): async def test_create_final_community_reports_missing_llm_throws(): - input_tables = load_input_tables([ - "workflow:create_final_nodes", - "workflow:create_final_covariates", - "workflow:create_final_relationships", - "workflow:create_final_entities", - "workflow:create_final_communities", - ]) - - config = get_config_for_workflow(workflow_name) - - # deleting the llm config results in a default mock injection in run_graph_intelligence - del config["create_community_reports"]["strategy"]["llm"] - - steps = build_steps(config) - - with pytest.raises(VerbParallelizationError): - await get_workflow_output( - input_tables, - { - "steps": steps, - }, + context = await create_test_context( + storage=[ + "create_final_nodes", + "create_final_covariates", + "create_final_relationships", + "create_final_entities", + "create_final_communities", + ] + ) + + config = create_graphrag_config() + config.community_reports.strategy = { + "type": "graph_intelligence", + } + + with pytest.raises(ParallelizationError): + await run_workflow( + config, + context, + NoopVerbCallbacks(), ) diff --git a/tests/verbs/test_create_final_covariates.py b/tests/verbs/test_create_final_covariates.py index aecd3e7782..8236abd7bc 100644 --- a/tests/verbs/test_create_final_covariates.py +++ b/tests/verbs/test_create_final_covariates.py @@ -2,20 +2,20 @@ # Licensed under the MIT License import pytest -from datashaper.errors import VerbParallelizationError from pandas.testing import assert_series_equal +from graphrag.callbacks.noop_verb_callbacks import NoopVerbCallbacks +from graphrag.config.create_graphrag_config import 
create_graphrag_config from graphrag.config.enums import LLMType -from graphrag.index.run.utils import create_run_context -from graphrag.index.workflows.v1.create_final_covariates import ( - build_steps, +from graphrag.index.run.derive_from_rows import ParallelizationError +from graphrag.index.workflows.create_final_covariates import ( + run_workflow, workflow_name, ) +from graphrag.utils.storage import load_table_from_storage from .util import ( - get_config_for_workflow, - get_workflow_output, - load_input_tables, + create_test_context, load_test_table, ) @@ -29,29 +29,27 @@ async def test_create_final_covariates(): - input_tables = load_input_tables(["workflow:create_base_text_units"]) + input = load_test_table("create_base_text_units") expected = load_test_table(workflow_name) - context = create_run_context(None, None, None) - await context.runtime_storage.set( - "base_text_units", input_tables["workflow:create_base_text_units"] + context = await create_test_context( + storage=["create_base_text_units"], ) - config = get_config_for_workflow(workflow_name) + config = create_graphrag_config() + config.claim_extraction.strategy = { + "type": "graph_intelligence", + "llm": MOCK_LLM_CONFIG, + "claim_description": "description", + } - config["claim_extract"]["strategy"]["llm"] = MOCK_LLM_CONFIG - - steps = build_steps(config) - - actual = await get_workflow_output( - input_tables, - { - "steps": steps, - }, + await run_workflow( + config, context, + NoopVerbCallbacks(), ) - input = input_tables["workflow:create_base_text_units"] + actual = await load_table_from_storage(workflow_name, context.storage) assert len(actual.columns) == len(expected.columns) # our mock only returns one covariate per text unit, so that's a 1:1 mapping versus the LLM-extracted content in the test data @@ -83,24 +81,19 @@ async def test_create_final_covariates(): async def test_create_final_covariates_missing_llm_throws(): - input_tables = load_input_tables(["workflow:create_base_text_units"]) 
- - context = create_run_context(None, None, None) - await context.runtime_storage.set( - "base_text_units", input_tables["workflow:create_base_text_units"] + context = await create_test_context( + storage=["create_base_text_units"], ) - config = get_config_for_workflow(workflow_name) - - del config["claim_extract"]["strategy"]["llm"] - - steps = build_steps(config) + config = create_graphrag_config() + config.claim_extraction.strategy = { + "type": "graph_intelligence", + "claim_description": "description", + } - with pytest.raises(VerbParallelizationError): - await get_workflow_output( - input_tables, - { - "steps": steps, - }, + with pytest.raises(ParallelizationError): + await run_workflow( + config, context, + NoopVerbCallbacks(), ) diff --git a/tests/verbs/test_create_final_documents.py b/tests/verbs/test_create_final_documents.py index f58b0e2721..a6916530a0 100644 --- a/tests/verbs/test_create_final_documents.py +++ b/tests/verbs/test_create_final_documents.py @@ -1,70 +1,59 @@ # Copyright (c) 2024 Microsoft Corporation. 
# Licensed under the MIT License -from graphrag.index.run.utils import create_run_context -from graphrag.index.workflows.v1.create_final_documents import ( - build_steps, +from graphrag.callbacks.noop_verb_callbacks import NoopVerbCallbacks +from graphrag.config.create_graphrag_config import create_graphrag_config +from graphrag.index.workflows.create_final_documents import ( + run_workflow, workflow_name, ) +from graphrag.utils.storage import load_table_from_storage from .util import ( compare_outputs, - get_config_for_workflow, - get_workflow_output, - load_input_tables, + create_test_context, load_test_table, ) async def test_create_final_documents(): - input_tables = load_input_tables([ - "workflow:create_base_text_units", - ]) expected = load_test_table(workflow_name) - context = create_run_context(None, None, None) - await context.runtime_storage.set( - "base_text_units", input_tables["workflow:create_base_text_units"] + context = await create_test_context( + storage=["create_base_text_units"], ) - config = get_config_for_workflow(workflow_name) + config = create_graphrag_config() - steps = build_steps(config) - - actual = await get_workflow_output( - input_tables, - { - "steps": steps, - }, - context=context, + await run_workflow( + config, + context, + NoopVerbCallbacks(), ) + actual = await load_table_from_storage(workflow_name, context.storage) + compare_outputs(actual, expected) async def test_create_final_documents_with_attribute_columns(): - input_tables = load_input_tables(["workflow:create_base_text_units"]) expected = load_test_table(workflow_name) - context = create_run_context(None, None, None) - await context.runtime_storage.set( - "base_text_units", input_tables["workflow:create_base_text_units"] + context = await create_test_context( + storage=["create_base_text_units"], ) - config = get_config_for_workflow(workflow_name) - - config["document_attribute_columns"] = ["title"] + config = create_graphrag_config() + 
config.input.document_attribute_columns = ["title"] - steps = build_steps(config) - - actual = await get_workflow_output( - input_tables, - { - "steps": steps, - }, - context=context, + await run_workflow( + config, + context, + NoopVerbCallbacks(), ) + actual = await load_table_from_storage(workflow_name, context.storage) + # we should have dropped "title" and added "attributes" # our test dataframe does not have attributes, so we'll assert without it # and separately confirm it is in the output diff --git a/tests/verbs/test_create_final_entities.py b/tests/verbs/test_create_final_entities.py index 491830205f..6d4430d398 100644 --- a/tests/verbs/test_create_final_entities.py +++ b/tests/verbs/test_create_final_entities.py @@ -1,24 +1,37 @@ # Copyright (c) 2024 Microsoft Corporation. # Licensed under the MIT License -from graphrag.index.flows.create_final_entities import ( - create_final_entities, -) -from graphrag.index.workflows.v1.create_final_entities import ( +from graphrag.callbacks.noop_verb_callbacks import NoopVerbCallbacks +from graphrag.config.create_graphrag_config import create_graphrag_config +from graphrag.index.workflows.create_final_entities import ( + run_workflow, workflow_name, ) +from graphrag.utils.storage import load_table_from_storage from .util import ( compare_outputs, + create_test_context, load_test_table, ) -def test_create_final_entities(): - input = load_test_table("base_entity_nodes") +async def test_create_final_entities(): expected = load_test_table(workflow_name) - actual = create_final_entities(input) + context = await create_test_context( + storage=["base_entity_nodes"], + ) + + config = create_graphrag_config() + + await run_workflow( + config, + context, + NoopVerbCallbacks(), + ) + + actual = await load_table_from_storage(workflow_name, context.storage) compare_outputs(actual, expected) assert len(actual.columns) == len(expected.columns) diff --git a/tests/verbs/test_create_final_nodes.py 
b/tests/verbs/test_create_final_nodes.py index db3b6ec57f..f37cb20cec 100644 --- a/tests/verbs/test_create_final_nodes.py +++ b/tests/verbs/test_create_final_nodes.py @@ -1,39 +1,42 @@ # Copyright (c) 2024 Microsoft Corporation. # Licensed under the MIT License -from datashaper import NoopVerbCallbacks - -from graphrag.config.models.embed_graph_config import EmbedGraphConfig -from graphrag.index.flows.create_final_nodes import ( - create_final_nodes, -) -from graphrag.index.workflows.v1.create_final_nodes import ( +from graphrag.callbacks.noop_verb_callbacks import NoopVerbCallbacks +from graphrag.config.create_graphrag_config import create_graphrag_config +from graphrag.index.workflows.create_final_nodes import ( + run_workflow, workflow_name, ) +from graphrag.utils.storage import load_table_from_storage from .util import ( compare_outputs, + create_test_context, load_test_table, ) -def test_create_final_nodes(): - base_entity_nodes = load_test_table("base_entity_nodes") - base_relationship_edges = load_test_table("base_relationship_edges") - base_communities = load_test_table("base_communities") - +async def test_create_final_nodes(): expected = load_test_table(workflow_name) - embed_config = EmbedGraphConfig(enabled=False) - actual = create_final_nodes( - base_entity_nodes=base_entity_nodes, - base_relationship_edges=base_relationship_edges, - base_communities=base_communities, - callbacks=NoopVerbCallbacks(), - embed_config=embed_config, - layout_enabled=False, + context = await create_test_context( + storage=[ + "base_entity_nodes", + "base_relationship_edges", + "base_communities", + ], ) + config = create_graphrag_config() + + await run_workflow( + config, + context, + NoopVerbCallbacks(), + ) + + actual = await load_table_from_storage(workflow_name, context.storage) + assert "id" in expected.columns columns = list(expected.columns.values) columns.remove("id") diff --git a/tests/verbs/test_create_final_relationships.py 
b/tests/verbs/test_create_final_relationships.py index 9f01e08304..223ca20ea4 100644 --- a/tests/verbs/test_create_final_relationships.py +++ b/tests/verbs/test_create_final_relationships.py @@ -1,24 +1,38 @@ # Copyright (c) 2024 Microsoft Corporation. # Licensed under the MIT License -from graphrag.index.flows.create_final_relationships import ( - create_final_relationships, -) -from graphrag.index.workflows.v1.create_final_relationships import ( + +from graphrag.callbacks.noop_verb_callbacks import NoopVerbCallbacks +from graphrag.config.create_graphrag_config import create_graphrag_config +from graphrag.index.workflows.create_final_relationships import ( + run_workflow, workflow_name, ) +from graphrag.utils.storage import load_table_from_storage from .util import ( compare_outputs, + create_test_context, load_test_table, ) -def test_create_final_relationships(): - edges = load_test_table("base_relationship_edges") +async def test_create_final_relationships(): expected = load_test_table(workflow_name) - actual = create_final_relationships(edges) + context = await create_test_context( + storage=["base_relationship_edges"], + ) + + config = create_graphrag_config() + + await run_workflow( + config, + context, + NoopVerbCallbacks(), + ) + + actual = await load_table_from_storage(workflow_name, context.storage) assert "id" in expected.columns columns = list(expected.columns.values) diff --git a/tests/verbs/test_create_final_text_units.py b/tests/verbs/test_create_final_text_units.py index b87e61f55d..19fb11c6f0 100644 --- a/tests/verbs/test_create_final_text_units.py +++ b/tests/verbs/test_create_final_text_units.py @@ -1,80 +1,70 @@ # Copyright (c) 2024 Microsoft Corporation. 
# Licensed under the MIT License -from graphrag.index.run.utils import create_run_context -from graphrag.index.workflows.v1.create_final_text_units import ( - build_steps, +from graphrag.callbacks.noop_verb_callbacks import NoopVerbCallbacks +from graphrag.config.create_graphrag_config import create_graphrag_config +from graphrag.index.workflows.create_final_text_units import ( + run_workflow, workflow_name, ) +from graphrag.utils.storage import load_table_from_storage from .util import ( compare_outputs, - get_config_for_workflow, - get_workflow_output, - load_input_tables, + create_test_context, load_test_table, ) async def test_create_final_text_units(): - input_tables = load_input_tables([ - "workflow:create_base_text_units", - "workflow:create_final_entities", - "workflow:create_final_relationships", - "workflow:create_final_covariates", - ]) expected = load_test_table(workflow_name) - context = create_run_context(None, None, None) - await context.runtime_storage.set( - "base_text_units", input_tables["workflow:create_base_text_units"] + context = await create_test_context( + storage=[ + "create_base_text_units", + "create_final_entities", + "create_final_relationships", + "create_final_covariates", + ], ) - config = get_config_for_workflow(workflow_name) + config = create_graphrag_config() + config.claim_extraction.enabled = True - config["covariates_enabled"] = True - - steps = build_steps(config) - - actual = await get_workflow_output( - input_tables, - { - "steps": steps, - }, - context=context, + await run_workflow( + config, + context, + NoopVerbCallbacks(), ) + actual = await load_table_from_storage(workflow_name, context.storage) + compare_outputs(actual, expected) async def test_create_final_text_units_no_covariates(): - input_tables = load_input_tables([ - "workflow:create_base_text_units", - "workflow:create_final_entities", - "workflow:create_final_relationships", - "workflow:create_final_covariates", - ]) expected = load_test_table(workflow_name) 
- context = create_run_context(None, None, None) - await context.runtime_storage.set( - "base_text_units", input_tables["workflow:create_base_text_units"] + context = await create_test_context( + storage=[ + "create_base_text_units", + "create_final_entities", + "create_final_relationships", + "create_final_covariates", + ], ) - config = get_config_for_workflow(workflow_name) + config = create_graphrag_config() + config.claim_extraction.enabled = False - config["covariates_enabled"] = False - - steps = build_steps(config) - - actual = await get_workflow_output( - input_tables, - { - "steps": steps, - }, - context=context, + await run_workflow( + config, + context, + NoopVerbCallbacks(), ) + actual = await load_table_from_storage(workflow_name, context.storage) + # we're short a covariate_ids column columns = list(expected.columns.values) columns.remove("covariate_ids") diff --git a/tests/verbs/test_extract_graph.py b/tests/verbs/test_extract_graph.py index 3ccc1d22b6..68c9bb231b 100644 --- a/tests/verbs/test_extract_graph.py +++ b/tests/verbs/test_extract_graph.py @@ -3,17 +3,16 @@ import pytest +from graphrag.callbacks.noop_verb_callbacks import NoopVerbCallbacks +from graphrag.config.create_graphrag_config import create_graphrag_config from graphrag.config.enums import LLMType -from graphrag.index.run.utils import create_run_context -from graphrag.index.workflows.v1.extract_graph import ( - build_steps, - workflow_name, +from graphrag.index.workflows.extract_graph import ( + run_workflow, ) +from graphrag.utils.storage import load_table_from_storage from .util import ( - get_config_for_workflow, - get_workflow_output, - load_input_tables, + create_test_context, load_test_table, ) @@ -49,35 +48,34 @@ async def test_extract_graph(): - input_tables = load_input_tables([ - "workflow:create_base_text_units", - ]) - nodes_expected = load_test_table("base_entity_nodes") edges_expected = load_test_table("base_relationship_edges") - context = create_run_context(None, 
None, None) - await context.runtime_storage.set( - "base_text_units", input_tables["workflow:create_base_text_units"] + context = await create_test_context( + storage=["create_base_text_units"], ) - config = get_config_for_workflow(workflow_name) - config["entity_extract"]["strategy"]["llm"] = MOCK_LLM_ENTITY_CONFIG - config["summarize_descriptions"]["strategy"]["llm"] = MOCK_LLM_SUMMARIZATION_CONFIG - - steps = build_steps(config) - - await get_workflow_output( - input_tables, - { - "steps": steps, - }, - context=context, + config = create_graphrag_config() + config.entity_extraction.strategy = { + "type": "graph_intelligence", + "llm": MOCK_LLM_ENTITY_CONFIG, + } + config.summarize_descriptions.strategy = { + "type": "graph_intelligence", + "llm": MOCK_LLM_SUMMARIZATION_CONFIG, + } + + await run_workflow( + config, + context, + NoopVerbCallbacks(), ) # graph construction creates transient tables for nodes, edges, and communities - nodes_actual = await context.runtime_storage.get("base_entity_nodes") - edges_actual = await context.runtime_storage.get("base_relationship_edges") + nodes_actual = await load_table_from_storage("base_entity_nodes", context.storage) + edges_actual = await load_table_from_storage( + "base_relationship_edges", context.storage + ) assert len(nodes_actual.columns) == len(nodes_expected.columns), ( "Nodes dataframe columns differ" @@ -91,69 +89,26 @@ async def test_extract_graph(): # this is because the mock responses always result in a single description, which is returned verbatim rather than summarized # we need to update the mocking to provide somewhat unique graphs so a true merge happens # the assertion should grab a node and ensure the description matches the mock description, not the original as we are doing below - assert nodes_actual["description"].to_numpy()[0] == "Company_A is a test company" - assert len(context.storage.keys()) == 0, "Storage should be empty" - - -async def test_extract_graph_with_snapshots(): - input_tables = 
load_input_tables([ - "workflow:create_base_text_units", - ]) - - context = create_run_context(None, None, None) - await context.runtime_storage.set( - "base_text_units", input_tables["workflow:create_base_text_units"] - ) - - config = get_config_for_workflow(workflow_name) - - config["entity_extract"]["strategy"]["llm"] = MOCK_LLM_ENTITY_CONFIG - config["summarize_descriptions"]["strategy"]["llm"] = MOCK_LLM_SUMMARIZATION_CONFIG - config["snapshot_graphml"] = True - config["snapshot_transient"] = True - config["embed_graph_enabled"] = True # need this on in order to see the snapshot - - steps = build_steps(config) - - await get_workflow_output( - input_tables, - { - "steps": steps, - }, - context=context, - ) - - assert context.storage.keys() == [ - "graph.graphml", - "base_entity_nodes.parquet", - "base_relationship_edges.parquet", - ], "Graph snapshot keys differ" - async def test_extract_graph_missing_llm_throws(): - input_tables = load_input_tables([ - "workflow:create_base_text_units", - ]) - - context = create_run_context(None, None, None) - await context.runtime_storage.set( - "base_text_units", input_tables["workflow:create_base_text_units"] + context = await create_test_context( + storage=["create_base_text_units"], ) - config = get_config_for_workflow(workflow_name) - - config["entity_extract"]["strategy"]["llm"] = MOCK_LLM_ENTITY_CONFIG - del config["summarize_descriptions"]["strategy"]["llm"] - - steps = build_steps(config) + config = create_graphrag_config() + config.entity_extraction.strategy = { + "type": "graph_intelligence", + "llm": MOCK_LLM_ENTITY_CONFIG, + } + config.summarize_descriptions.strategy = { + "type": "graph_intelligence", + } with pytest.raises(ValueError): # noqa PT011 - await get_workflow_output( - input_tables, - { - "steps": steps, - }, - context=context, + await run_workflow( + config, + context, + NoopVerbCallbacks(), ) diff --git a/tests/verbs/test_generate_text_embeddings.py b/tests/verbs/test_generate_text_embeddings.py 
index c0919501d8..640284c7ca 100644 --- a/tests/verbs/test_generate_text_embeddings.py +++ b/tests/verbs/test_generate_text_embeddings.py @@ -1,53 +1,44 @@ # Copyright (c) 2024 Microsoft Corporation. # Licensed under the MIT License -from io import BytesIO - -import pandas as pd - +from graphrag.callbacks.noop_verb_callbacks import NoopVerbCallbacks +from graphrag.config.create_graphrag_config import create_graphrag_config +from graphrag.config.enums import TextEmbeddingTarget from graphrag.index.config.embeddings import ( all_embeddings, ) -from graphrag.index.run.utils import create_run_context -from graphrag.index.workflows.v1.generate_text_embeddings import ( - build_steps, - workflow_name, +from graphrag.index.workflows.generate_text_embeddings import ( + run_workflow, ) +from graphrag.utils.storage import load_table_from_storage from .util import ( - get_config_for_workflow, - get_workflow_output, - load_input_tables, + create_test_context, ) async def test_generate_text_embeddings(): - input_tables = load_input_tables( - inputs=[ - "workflow:create_final_documents", - "workflow:create_final_relationships", - "workflow:create_final_text_units", - "workflow:create_final_entities", - "workflow:create_final_community_reports", + context = await create_test_context( + storage=[ + "create_final_documents", + "create_final_relationships", + "create_final_text_units", + "create_final_entities", + "create_final_community_reports", ] ) - context = create_run_context(None, None, None) - - config = get_config_for_workflow(workflow_name) - - config["text_embed"]["strategy"]["type"] = "mock" - config["snapshot_embeddings"] = True - config["embedded_fields"] = all_embeddings + config = create_graphrag_config() + config.embeddings.strategy = { + "type": "mock", + } + config.embeddings.target = TextEmbeddingTarget.all + config.snapshots.embeddings = True - steps = build_steps(config) - - await get_workflow_output( - input_tables, - { - "steps": steps, - }, + await 
run_workflow( + config, context, + NoopVerbCallbacks(), ) parquet_files = context.storage.keys() @@ -56,23 +47,19 @@ async def test_generate_text_embeddings(): assert f"embeddings.{field}.parquet" in parquet_files # entity description should always be here, let's assert its format - entity_description_embeddings_buffer = BytesIO( - await context.storage.get( - "embeddings.entity.description.parquet", as_bytes=True - ) - ) - entity_description_embeddings = pd.read_parquet( - entity_description_embeddings_buffer + entity_description_embeddings = await load_table_from_storage( + "embeddings.entity.description", context.storage ) + assert len(entity_description_embeddings.columns) == 2 assert "id" in entity_description_embeddings.columns assert "embedding" in entity_description_embeddings.columns # every other embedding is optional but we've turned them all on, so check a random one - document_text_embeddings_buffer = BytesIO( - await context.storage.get("embeddings.document.text.parquet", as_bytes=True) + document_text_embeddings = await load_table_from_storage( + "embeddings.document.text", context.storage ) - document_text_embeddings = pd.read_parquet(document_text_embeddings_buffer) + assert len(document_text_embeddings.columns) == 2 assert "id" in document_text_embeddings.columns assert "embedding" in document_text_embeddings.columns diff --git a/tests/verbs/util.py b/tests/verbs/util.py index 8c9cc990ef..91a9625893 100644 --- a/tests/verbs/util.py +++ b/tests/verbs/util.py @@ -1,35 +1,31 @@ # Copyright (c) 2024 Microsoft Corporation. 
# Licensed under the MIT License -from typing import cast - import pandas as pd -from datashaper import Workflow from pandas.testing import assert_series_equal -from graphrag.config.create_graphrag_config import create_graphrag_config -from graphrag.index.config.workflow import PipelineWorkflowConfig from graphrag.index.context import PipelineRunContext -from graphrag.index.create_pipeline_config import create_pipeline_config from graphrag.index.run.utils import create_run_context +from graphrag.utils.storage import write_table_to_storage pd.set_option("display.max_columns", None) -def load_input_tables(inputs: list[str]) -> dict[str, pd.DataFrame]: - """Harvest all the referenced input IDs from the workflow being tested and pass them here.""" - # stick all the inputs in a map - Workflow looks them up by name - input_tables: dict[str, pd.DataFrame] = {} +async def create_test_context(storage: list[str] | None = None) -> PipelineRunContext: + """Create a test context with tables loaded into storage.""" + context = create_run_context(None, None, None) - source = pd.read_parquet("tests/verbs/data/source_documents.parquet") - input_tables["source"] = source + # always set the input docs + input = load_test_table("source_documents") + await write_table_to_storage(input, "input", context.storage) - for input in inputs: - # remove the workflow: prefix if it exists, because that is not part of the actual table filename - name = input.replace("workflow:", "") - input_tables[input] = pd.read_parquet(f"tests/verbs/data/{name}.parquet") + if storage: + for name in storage: + table = load_test_table(name) + # normal storage interface insists on bytes + await write_table_to_storage(table, name, context.storage) - return input_tables + return context def load_test_table(output: str) -> pd.DataFrame: @@ -37,41 +33,6 @@ def load_test_table(output: str) -> pd.DataFrame: return pd.read_parquet(f"tests/verbs/data/{output}.parquet") -def get_config_for_workflow(name: str) -> 
PipelineWorkflowConfig: - """Instantiates the bare minimum config to get a default workflow config for testing.""" - config = create_graphrag_config() - - # this flag needs to be set before creating the pipeline config, or the entire covariate workflow will be excluded - config.claim_extraction.enabled = True - - pipeline_config = create_pipeline_config(config) - - result = next(conf for conf in pipeline_config.workflows if conf.name == name) - - return cast("PipelineWorkflowConfig", result.config) - - -async def get_workflow_output( - input_tables: dict[str, pd.DataFrame], - schema: dict, - context: PipelineRunContext | None = None, -) -> pd.DataFrame: - """Pass in the input tables, the schema, and the output name""" - - # the bare minimum workflow is the pipeline schema and table context - workflow = Workflow( - schema=schema, - input_tables=input_tables, - ) - - run_context = context or create_run_context(None, None, None) - - await workflow.run(context=run_context) - - # if there's only one output, it is the default here, no name required - return cast("pd.DataFrame", workflow.output()) - - def compare_outputs( actual: pd.DataFrame, expected: pd.DataFrame, columns: list[str] | None = None ) -> None: