Commit 5d1a577

WIP: RAG setup example (#1431)
1 parent f90894a commit 5d1a577

6 files changed: +147 additions, -0 deletions
Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
# Source ChatGPT

from transformers import DPRContextEncoder, DPRContextEncoderTokenizer

# Load the pretrained DPR context encoder and its tokenizer
tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

# Example documents
documents = [
    "The capital of France is Paris.",
    "Python is a programming language.",
    "Hugging Face is a popular platform for NLP models.",
    "The Eiffel Tower is located in Paris."
]

# Tokenize and encode documents
encoded_inputs = tokenizer(documents, padding=True, truncation=True, return_tensors="pt")
document_embeddings = encoder(**encoded_inputs).pooler_output

# Print embeddings
print(document_embeddings)
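DPR is trained so that question and passage embeddings are scored by inner (dot) product rather than by Euclidean distance. A toy numpy sketch of that scoring, with made-up low-dimensional vectors standing in for the real 768-dimensional DPR outputs:

```python
import numpy as np

# Toy stand-ins for DPR pooler_output rows (real DPR embeddings are 768-dim)
doc_embs = np.array([[0.2, 0.9], [0.8, 0.1]], dtype=np.float32)
query_emb = np.array([0.7, 0.2], dtype=np.float32)

# DPR ranks passages by inner product with the question embedding
scores = doc_embs @ query_emb
best = int(scores.argmax())
print(best)  # 1 (the second document scores highest)
```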
Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
# Source ChatGPT

from sentence_transformers import SentenceTransformer

# Load the pre-trained model (Sentence-BERT)
model = SentenceTransformer('all-MiniLM-L6-v2')  # or use a model like DPR for better retrieval

# Example documents
documents = [
    "The capital of France is Paris.",
    "Python is a programming language.",
    "Hugging Face is a popular platform for NLP models.",
    "The Eiffel Tower is located in Paris."
]

# Convert documents to embeddings
document_embeddings = model.encode(documents)

# Print the embeddings (this will be a list of vectors)
print(document_embeddings)
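Sentence-BERT embeddings are usually compared with cosine similarity. A minimal numpy sketch of that comparison, using small made-up vectors in place of real `model.encode` output:

```python
import numpy as np

def cosine_similarity_matrix(embeddings):
    """Pairwise cosine similarity between row vectors."""
    emb = np.asarray(embeddings, dtype=np.float32)
    emb = emb / np.linalg.norm(emb, axis=1, keepdims=True)
    return emb @ emb.T

# Toy stand-ins for model.encode(documents) output
toy = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])
print(cosine_similarity_matrix(toy))
```

The diagonal is 1 (each vector is identical to itself), and orthogonal vectors score 0.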
Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
# Source ChatGPT

import numpy as np
from transformers import RagRetriever, RagTokenizer, RagSequenceForGeneration

# Initialize the tokenizer, retriever, and RAG model
tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
retriever = RagRetriever.from_pretrained("facebook/rag-sequence-nq")
model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq")

# Example query
query = "Where is the Eiffel Tower located?"

# Tokenize the input query
inputs = tokenizer(query, return_tensors="pt")

# Retrieve relevant documents from your FAISS index.
# `embedding_model`, `index`, and `documents` come from the earlier scripts;
# embed the query with the same embedding model used for the documents.
query_embedding = embedding_model.encode([query])
D, I = index.search(np.array(query_embedding).astype('float32'), 5)  # top-5

# Use the indices to get the top-k relevant documents
retrieved_docs = [documents[i] for i in I[0]]

# Convert the retrieved documents into the appropriate format for the RAG model
retrieved_docs_input = tokenizer(retrieved_docs, padding=True, truncation=True, return_tensors="pt")

# Generate the response using RAG with the retrieved context
generated_output = model.generate(input_ids=inputs["input_ids"], context_input_ids=retrieved_docs_input["input_ids"])

# Decode and print the response
answer = tokenizer.decode(generated_output[0], skip_special_tokens=True)
print(answer)
Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
# Source ChatGPT

import numpy as np

# Let's say you have a new query to retrieve similar documents
query = "Where is the Eiffel Tower?"
query_embedding = model.encode([query])  # Assuming you're using Sentence-BERT

# Perform the search for the top-k most similar documents
k = 2  # Number of similar documents to retrieve
D, I = index.search(np.array(query_embedding).astype('float32'), k)

# D contains the distances (lower is more similar), and I contains the indices of the retrieved documents
print(f"Distances: {D}")
print(f"Indices of retrieved documents: {I}")

# Retrieve the documents based on the indices
retrieved_documents = [documents[i] for i in I[0]]
print(f"Retrieved documents: {retrieved_documents}")
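One detail worth knowing: if embeddings are unit-normalized before indexing, the L2 ranking from an IndexFlatL2 search matches a cosine-similarity ranking, since for unit vectors ||a - b||² = 2 - 2·cos(a, b). A quick numpy check with random stand-in vectors:

```python
import numpy as np

rng = np.random.default_rng(0)
a = rng.normal(size=(5, 8)).astype(np.float32)  # stand-ins for document embeddings
b = rng.normal(size=8).astype(np.float32)       # stand-in for a query embedding

# Unit-normalize the rows and the query
a /= np.linalg.norm(a, axis=1, keepdims=True)
b /= np.linalg.norm(b)

sq_l2 = ((a - b) ** 2).sum(axis=1)
cos = a @ b

# ||a - b||^2 == 2 - 2*cos(a, b) for unit vectors
print(np.allclose(sq_l2, 2 - 2 * cos, atol=1e-5))  # True
```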
Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
RAG Architecture
-----------------
RAG stands for Retrieval Augmented Generation.

In Generative AI tasks like Question-Answering, RAG setups
retrieve documents that are relevant to the query, and use them
as context when generating the answer.

The steps involved in a RAG setup are:
1. Choose an embedding model (embedding-model-dpr.py)
2. Store document embeddings in a Vector database (store-documentembeddings-vectordb.py)
3. Search for documents relevant to a query (search-documents.py)
4. Integrate with the RAG setup (rag-integration.py)
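The four steps above can be sketched end-to-end in a few lines. This toy sketch uses a bag-of-words embedding as a stand-in for a real embedding model, and a numpy array as a stand-in for a vector database; all names here are illustrative:

```python
import numpy as np

def embed(text, vocab):
    """Toy bag-of-words embedding over a fixed vocabulary (stand-in for a real model)."""
    words = text.lower().replace(".", "").replace("?", "").split()
    vec = np.array([float(words.count(w)) for w in vocab], dtype=np.float32)
    norm = np.linalg.norm(vec)
    return vec / norm if norm else vec

# Steps 1+2: embed documents and "store" them
documents = [
    "The capital of France is Paris.",
    "Python is a programming language.",
]
vocab = sorted({w for d in documents for w in d.lower().replace(".", "").split()})
doc_matrix = np.stack([embed(d, vocab) for d in documents])

# Step 3: search for the document closest to the query (squared L2, lower is better)
query_vec = embed("What is the capital of France?", vocab)
dists = ((doc_matrix - query_vec) ** 2).sum(axis=1)
best = int(dists.argmin())

# Step 4: the retrieved document would be passed as context to a generator
print(documents[best])  # The capital of France is Paris.
```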
Consider an enterprise that wants to deploy chatbots for answering questions related to different
aspects of the company, such as HR questions, internal coding standards used within the company, and social events.
Imagine that information about these is available on various internal tools such as Workday,
JIRA/Confluence, internal social media apps like Signal, etc.
The chatbots need to use all this information, distributed across different document repositories,
when answering questions.

The architecture for such a system can look as follows:
We create one Vector database which is loaded with all the documents from the different internal document sources
(Workday, Confluence, Signal, etc.).
We create different chatbots for different tasks (e.g.: askhr - for HR questions/answers,
codr - for coding questions/answers, tgif - for social events). Each chatbot can have special prompts
relevant to its task. All the chatbots use the same Vector database as part of their RAG setup.
A single Vector database shared by all the chatbots ensures that a single team can work on setting up
this database. Also, a single database can correctly answer questions that span multiple categories
(e.g.: What are the coding standards used during internal Hackathons? Here the question spans two categories -
coding standards and social events.)
We will have one Helm chart representing the Vector database, and separate Helm charts
for the chatbots. We will deploy one instance of the Vector database CRD, which will run in its own namespace.
Each chatbot will also be deployed as a single instance, and will run in its own namespace.
In order to allow the chatbots to use the Vector database in the RAG process, we will need to enable cross-namespace
communication between each chatbot's namespace and the Vector database namespace.
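One common way to express such a cross-namespace allowance is a Kubernetes NetworkPolicy on the Vector database's namespace. A hedged config sketch only: the namespace name, labels, and port below are illustrative, and enforcement requires a CNI plugin that supports NetworkPolicy:

```yaml
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: allow-chatbots
  namespace: vectordb            # illustrative namespace for the Vector database
spec:
  podSelector: {}                # applies to all pods in the vectordb namespace
  policyTypes:
    - Ingress
  ingress:
    - from:
        - namespaceSelector:
            matchLabels:
              app-role: chatbot  # label each chatbot namespace with this key/value
      ports:
        - protocol: TCP
          port: 8000             # illustrative service port of the Vector database
```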
Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
# Source ChatGPT

import faiss
import numpy as np

# Convert the document embeddings to numpy arrays (FAISS requires numpy arrays);
# if they are torch tensors, detach first: document_embeddings.detach().numpy()
document_embeddings = np.array(document_embeddings).astype('float32')

# Create a FAISS index for similarity search
index = faiss.IndexFlatL2(document_embeddings.shape[1])  # Using L2 distance (Euclidean)

# Add the document embeddings to the index
index.add(document_embeddings)

# Now, the index can be used to search for similar documents
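IndexFlatL2 performs exact, brute-force search, and it reports squared L2 distances. What `index.search` computes can be reproduced in plain numpy (toy vectors below are illustrative):

```python
import numpy as np

def flat_l2_search(query_vecs, doc_vecs, k):
    """numpy equivalent of faiss.IndexFlatL2.search: squared L2 distances, ascending."""
    q = np.asarray(query_vecs, dtype=np.float32)
    d = np.asarray(doc_vecs, dtype=np.float32)
    # squared L2 distance between every query and every document
    dists = ((q[:, None, :] - d[None, :, :]) ** 2).sum(axis=-1)
    I = np.argsort(dists, axis=1)[:, :k]
    D = np.take_along_axis(dists, I, axis=1)
    return D, I

docs = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 2.0]], dtype=np.float32)
D, I = flat_l2_search(np.array([[0.9, 0.1]]), docs, k=2)
print(I[0])  # [1 0] -- the nearest two document indices
```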
