Source code for rag_skeleton.data_processing

# src/rag_skeleton/data_processing.py
import os
import gc
import torch
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma


[docs]
class DataProcessor:
    """
    Handles loading, processing, and creating vector databases for documents.

    Typical use is a single call to `process_and_create_db`, which loads every
    PDF found in `data_path`, splits the loaded documents into chunks and
    stores the embedded chunks in a Chroma database under `vectordb_path`.
    """

    def __init__(self, vectordb_path: str, data_path: str = "data/raw",
                 embedding_model: str = "Alibaba-NLP/gte-large-en-v1.5"):
        """
        Initialize the DataProcessor.

        Parameters:
        - vectordb_path: str, path to the directory where the vector database will be stored.
                         Required; there is no default.

        - data_path: str, path to the directory containing raw PDF files.
                     Default is "data/raw".

        - embedding_model: str, the embedding model to be used for vectorization.
                           Default is "Alibaba-NLP/gte-large-en-v1.5".

        Note:
        We suggest models from the MTEB leaderboard
        (https://huggingface.co/spaces/mteb/leaderboard) based on the `Retrieval Average` score
        and `Memory Usage`. Balancing retrieval quality and available resources is recommended
        to optimize both accuracy and efficiency in your specific environment.

        The embedding model is loaded with `trust_remote_code=True`, which allows
        custom code shipped with the model repository to run; only use models
        from sources you trust.
        """
        self.data_path = data_path
        self.vectordb_path = vectordb_path
        # trust_remote_code is needed by models that ship custom modelling code:
        # https://github.com/langchain-ai/langchain/issues/6080#issuecomment-1963311548
        self.embedding = HuggingFaceEmbeddings(
            model_name=embedding_model,
            model_kwargs={"trust_remote_code": True},
        )
        # Populated by create_vector_db().
        self.vector_store = None

    def load_documents(self, enrich_metadata: bool = False, year: int = 2024) -> list:
        """
        Loads PDF documents from the specified data path and optionally enriches metadata.

        Directory entries are visited in sorted order so the result is
        reproducible across platforms. Only regular files whose name ends in
        ".pdf" (case-insensitive) are loaded; sub-directories are skipped.

        Parameters:
        - enrich_metadata (bool): If True, add "name" (file name without extension)
                                  and "year" metadata to each loaded document.

        - year (int): Value stored under the "year" metadata key when
                      `enrich_metadata` is True. Default is 2024.

        Returns:
        - list: List of loaded documents with optional metadata.

        Raises:
        - OSError: if `data_path` cannot be listed (e.g. it does not exist).
        """
        docs = []
        # os.listdir returns entries in arbitrary order; sort for determinism.
        for file_name in sorted(os.listdir(self.data_path)):
            if not file_name.lower().endswith(".pdf"):
                continue

            file_path = os.path.join(self.data_path, file_name)
            # A directory that happens to be named "*.pdf" is not a document.
            if not os.path.isfile(file_path):
                continue

            loaded_docs = PyMuPDFLoader(file_path).load()

            if enrich_metadata:
                doc_name = os.path.splitext(file_name)[0]  # file name without extension
                for doc in loaded_docs:
                    doc.metadata["name"] = doc_name
                    doc.metadata["year"] = year

            docs.extend(loaded_docs)

        print(f"Loaded {len(docs)} documents.")
        return docs

    def split_documents(self, docs: list, chunk_size: int = 1500, chunk_overlap: int = 100) -> list:
        """
        Splits documents into chunks for vectorization.

        Parameters:
        - docs: list, documents to split.

        - chunk_size: int, size of each chunk. Default is 1500.

        - chunk_overlap: int, overlap between chunks. Default is 100.

        Returns:
        - list: List of document chunks.
        """
        splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        return splitter.split_documents(docs)

    def create_vector_db(self, docs: list) -> None:
        """
        Creates and stores the vector database in ChromaDB.

        The database directory is created if it does not exist yet, and the
        resulting store is kept on `self.vector_store`.

        Parameters:
        - docs: list, document chunks to vectorize and store.
        """
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(self.vectordb_path, exist_ok=True)

        self.vector_store = Chroma.from_documents(
            documents=docs,
            embedding=self.embedding,
            persist_directory=self.vectordb_path
        )

        print(f"Knowledge base created and saved in directory: {self.vectordb_path}")

    def process_and_create_db(self) -> None:
        """Main method to load, split, and create vectorDB, using each step's default settings."""
        docs = self.load_documents()
        splits = self.split_documents(docs)
        self.create_vector_db(splits)