-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhelpers.py
More file actions
234 lines (181 loc) · 7.59 KB
/
helpers.py
File metadata and controls
234 lines (181 loc) · 7.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
#!/usr/bin/env python # noqa: EXE001
### Evaluation Functions ###
## Imports ##
import re
from pathlib import Path
from typing import Any
import pandas as pd
from langchain_community.vectorstores import Chroma
from mlflow.entities.experiment import Experiment
from mlflow.metrics.genai import EvaluationExample
from fai.run import LlmMethods, LlmMethodsConfig
from fai.store import StoreVectors, StoreVectorsConfig
####################
LLM_LOCATION = Path("backend/models/llm/")
# Returns evaluation questions (either demo or main set from csv)
def get_eval_questions(
    set_type: str = "main_demo",
    file_path: str = "questions/",
) -> pd.DataFrame:
    """Retrieve evaluation questions from a CSV file.

    Args:
        set_type (str, optional): Which question set to load. One of
            "main_demo", "main_full", "negative_control_demo", or
            "negative_control_full". Defaults to "main_demo".
        file_path (str, optional): Path to the directory containing the CSV
            files; it is concatenated with the file name, so it must end
            with a path separator. Defaults to "questions/".

    Returns:
        pd.DataFrame: DataFrame with a single "questions" column.

    Raises:
        ValueError: If set_type is not one of the recognized set types.
            (The previous if/elif chain left file_name unbound for unknown
            values and crashed with an obscure NameError.)
    """
    # Map each known set type to its CSV file name.
    file_names = {
        "main_demo": "main_questions_demo.csv",
        "main_full": "main_questions_full.csv",
        "negative_control_demo": "neg_ctrl_questions_demo.csv",
        "negative_control_full": "neg_ctrl_questions_full.csv",
    }
    try:
        file_name = file_names[set_type]
    except KeyError:
        msg = f"Unknown set_type {set_type!r}; expected one of {sorted(file_names)}"
        raise ValueError(msg) from None
    # Only the "question" column is needed; rename it to the plural form
    # expected by downstream evaluation code.
    return pd.read_csv(file_path + file_name, usecols=["question"]).rename(
        columns={"question": "questions"}
    )
# Parses the evaluation examples from a .txt file into a dictionary
def parse_text_to_dict(file_path: Path) -> dict[str, str]:
    """Parse a structured example text file into a key -> text dictionary.

    Lines beginning with one of the known section prefixes ("input_",
    "grading_context_", "output_", "score_", "justification_") start a new
    entry; every following non-blank line is appended to that entry's value
    (each appended line gains a trailing space). Content appearing before
    the first key line is ignored.

    Args:
        file_path (Path): The path to the text file.

    Returns:
        dict[str, str]: Mapping of section keys to their accumulated text.
    """
    section_prefixes = ("input_", "grading_context_", "output_", "score_", "justification_")
    result: dict[str, str] = {}
    current_key = None
    # Read the whole file, drop surrounding whitespace, and walk it line by line.
    for raw_line in Path(file_path).read_text().strip().split("\n"):
        if not raw_line.strip():
            continue  # blank separator line
        # A prefix match on the *raw* line marks a new section header.
        if raw_line.startswith(section_prefixes):
            current_key = raw_line.strip()
            result[current_key] = ""
        elif current_key is not None:
            # Accumulate body text under the active key.
            result[current_key] += raw_line.strip() + " "
    return result
# # Returns evaluation metric examples for a given metric name
def get_eval_metric_examples(metric_name: str, start_index: int = 1) -> list:
    """Build EvaluationExample objects for the named metric.

    Reads grading_criteria/<metric_name>.txt, counts the numbered examples
    it contains (one per "input_<i>" marker), then assembles one
    EvaluationExample per numbered example via parse_text_to_dict.

    Parameters:
        metric_name (str): The name of the metric.
        start_index (int): Starting index for the examples. Default is 1.

    Returns:
        list: EvaluationExample objects parsed from the criteria file.

    Raises:
        FileNotFoundError: If the grading criteria file for the given
            metric name is not found.
    """
    criteria_path = Path(f"grading_criteria/{metric_name}.txt")
    # One "input_<digits>" marker per example gives us the example count.
    raw_text = criteria_path.read_text()
    example_count = len(re.findall(r"input_\d+", raw_text))
    sections = parse_text_to_dict(criteria_path)
    examples = []
    for idx in range(start_index, start_index + example_count):
        examples.append(
            EvaluationExample(
                input=sections[f"input_{idx}"],
                output=sections[f"output_{idx}"],
                score=sections[f"score_{idx}"],
                justification=sections[f"justification_{idx}"],
                grading_context={"context": sections[f"grading_context_{idx}"]},
            )
        )
    return examples
# Grabs file paths from a text file and returns a dictionary
def parse_file_paths(
    file_path: str, line_step: int = 3, next_line_offset: int = 3
) -> dict[str, str]:
    """Parse a text file mapping file names to their paths into a dictionary.

    The file is read line by line: every `line_step`-th line is taken as a
    file name, and the line `next_line_offset` lines after it as the
    corresponding path.

    Args:
        file_path (str): The path to the text file.
        line_step (int): Number of lines to step between consecutive
            file-name lines. Default is 3.
        next_line_offset (int): Number of lines after a file name at which
            its path is found. Default is 3.

    Returns:
        dict[str, str]: A dictionary where keys are file names and values
            are paths. A trailing file name with no matching path line is
            skipped instead of raising IndexError (the previous version
            crashed on truncated files).
    """
    paths_dict: dict[str, str] = {}
    with Path(file_path).open() as file:
        lines = file.readlines()
    for i in range(0, len(lines), line_step):
        # Guard: a truncated file may end with a name but no path line.
        if i + next_line_offset >= len(lines):
            break
        name = lines[i].strip()  # drop trailing newline/whitespace
        # NOTE: a dedicated local is used here; the previous version
        # reassigned (shadowed) the file_path parameter inside the loop.
        target = lines[i + next_line_offset].strip()
        paths_dict[name] = target
    return paths_dict
# # Return Mlflow experiment details
def return_experiment_details(experiment: Experiment) -> list[str]:
    """Format the key attributes of an Mlflow experiment as strings.

    Parameters:
        experiment (Experiment): The Mlflow experiment object.

    Returns:
        list[str]: "label: value" strings for the experiment's name, ID,
        artifact location, and creation timestamp.
    """
    # Pair each display label with its attribute value, then format uniformly.
    fields = (
        ("Name", experiment.name),
        ("Experiment_id", experiment.experiment_id),
        ("Artifact Location", experiment.artifact_location),
        ("Creation timestamp", experiment.creation_time),
    )
    return [f"{label}: {value}" for label, value in fields]
# # Wrapper function for the StoreVectors class
def split_and_create_vectors(
    chunk_size: int,
    chunk_overlap: int,
    embedding_model: str,
) -> Chroma:
    """Split PDF documents into chunks and store their embedding vectors.

    Thin wrapper around the StoreVectors pipeline: load and split PDFs,
    initiate the embedding model, drop redundant documents, then embed and
    store the chunks in Chroma.

    Args:
        chunk_size (int): The size of each text chunk.
        chunk_overlap (int): The overlap between consecutive chunks.
        embedding_model (str): The name of the embedding model to use.

    Returns:
        Chroma: The Chroma object containing the vectors for each chunk.
    """
    config = StoreVectorsConfig(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        embedding_model=embedding_model,
    )
    store_vectors = StoreVectors(config=config)
    # Run the pipeline steps in sequence — presumably order-dependent
    # (split before embed, embed model before dedup/storage); verify
    # against the StoreVectors implementation.
    store_vectors.load_and_split_pdfs()
    store_vectors.initiate_embedding_model()
    store_vectors.drop_redundant_docs()
    # Only PDF documents are embedded here.
    return store_vectors.embed_and_store_in_chroma(doc_type="pdf")
# # Wrapper function for the LlmMethods class
def create_llm_pipe(
    llm_path: Path,
    embedding_model: str,
    temperature: float,
) -> Any:
    """Create a retrieval question-answering pipeline for a Language Model (LLM).

    Thin wrapper around LlmMethods: builds a config from the given
    parameters and returns the retrieval-QA evaluation pipeline.

    Args:
        llm_path (Path): Filesystem path to the LLM model.
        embedding_model (str): The embedding model to be used by the LLM.
        temperature (float): The temperature parameter for LLM generation.

    Returns:
        Any: The retrieval question-answering pipeline produced by
        LlmMethods.retrieval_qa_eval().
    """
    config = LlmMethodsConfig(
        embedding_model=embedding_model,
        llm_generate_temp=temperature,
        model_path=llm_path,
    )
    llm_methods = LlmMethods(config=config)
    return llm_methods.retrieval_qa_eval()