Building an AI-Driven Document Query Assistant: New Update Supports GPT-4o and Incremental Re-Indexing

Please refer to the earlier posts in this series to get up to speed on this project.

This is a continuation of the “Building an AI-Driven Document Query Assistant” series. In the last update, I implemented a non-OpenAI embedding model, nomic-embed-text:latest, served through Ollama. In this update, I have added an incremental re-indexing feature and enabled users to choose between multiple GPT language models: the script now supports gpt-3.5-turbo and gpt-4o.
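
Under the hood, the incremental path boils down to a set difference: compare the files currently in the documents folder against a small metadata.json of files that were already indexed, embed only the new ones, and insert their nodes into the existing index. Here is a condensed sketch of that idea (metadata_path, documents_path, index, and parser are the same names used in the complete script below):

# Sketch: detect new files and insert only their nodes into the existing index
if os.path.exists(metadata_path):
    with open(metadata_path) as f:
        indexed = set(json.load(f))
else:
    indexed = set()
current = {name for name in os.listdir(documents_path)
           if os.path.isfile(os.path.join(documents_path, name))}
new_files = current - indexed
if new_files:
    paths = [os.path.join(documents_path, name) for name in new_files]
    new_docs = SimpleDirectoryReader(input_files=paths).load_data()
    index.insert_nodes(parser.get_nodes_from_documents(new_docs))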

You can find the public GitHub repo for this project at: https://github.com/aarriitt666/ragai_llamaindex

Complete code:

import streamlit as st
import os
from dotenv import load_dotenv
from llama_index.llms.openai import OpenAI
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, load_index_from_storage, StorageContext
from llama_index.core.settings import Settings
import warnings
from llama_index.core.indices.postprocessor import SentenceTransformerRerank
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import TextNode
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import json
import traceback
from llama_index.embeddings.ollama import OllamaEmbedding
# Load environment variables
load_dotenv('.env')
# Suppress specific FutureWarnings from huggingface_hub
warnings.filterwarnings("ignore", category=FutureWarning, module='huggingface_hub')
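# Minimal stand-in for llama_index's QueryBundle: the reranker only reads
# the .query_str attribute, so this duck-typed class suffices here.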
class QueryBundle:
    def __init__(self, query_str):
        self.query_str = query_str
# Define paths
storage_path = './vectorstore'
documents_path = './documents'
# Model selection
model_option = st.selectbox(
    "Select LLM model",
    ("gpt-3.5-turbo", "gpt-4o")
    )
# Set the model configuration
Settings.llm = OpenAI(model=model_option, temperature=0.1)
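# chunk_size/chunk_overlap are measured in tokens: 2048-token chunks with a
# 500-token overlap trade fewer, larger nodes for more context per node.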
Settings.chunk_size = 2048
Settings.chunk_overlap = 500
ollama_embedding = OllamaEmbedding(
    model_name="nomic-embed-text:latest",
    base_url="http://localhost:11434",
    ollama_additional_kwargs={"mirostat": 0},
)
Settings.embed_model = ollama_embedding
# Ensure directories exist
os.makedirs(storage_path, exist_ok=True)
os.makedirs(documents_path, exist_ok=True)
# Initialize the reranker
reranker = SentenceTransformerRerank(model="cross-encoder/ms-marco-MiniLM-L-2-v2", top_n=5)
# Initialize the parser
parser = SentenceSplitter()
def document_changes_detected(documents_path, metadata_path):
    # Load existing metadata if available
    if os.path.exists(metadata_path):
        with open(metadata_path, 'r') as f:
            indexed_files = set(json.load(f))
    else:
        indexed_files = set()
    # Get the current set of documents
    current_files = {file for file in os.listdir(documents_path) if os.path.isfile(os.path.join(documents_path, file))}
    # Detect changes
    new_files = current_files - indexed_files
    removed_files = indexed_files - current_files
    # Uncomment the lines below to debug file-change detection
    # st.write(f"Current files: {current_files}")
    # st.write(f"Indexed files: {indexed_files}")
    # st.write(f"New files: {new_files}")
    # st.write(f"Removed files: {removed_files}")
    # Update metadata file if changes are detected
    if new_files or removed_files:
        with open(metadata_path, 'w') as f:
            json.dump(list(current_files), f)
    return list(new_files), list(removed_files)
def pprint_response(response, show_source=False):
    # Local helper: handles plain strings as well as Response objects
    if isinstance(response, str):
        print(response)  # Handle the string directly
    else:
        if response.response is None:
            print("No response.")
        else:
            print(response.response)
            if show_source:
                print("Source:", response.source_nodes)
                
class EnhancedTextNode:
    """Duck-types NodeWithScore: exposes .node and accepts a .score attribute,
    which is all SentenceTransformerRerank needs."""
    def __init__(self, text_node):
        self.node = text_node  # Wrap the original TextNode
    def get_content(self, metadata_mode):
        return self.node.text  # Text accessor in case the reranker calls it on the wrapper
# enhance_and_rerank_responses wraps each response in an EnhancedTextNode so the reranker can score it
def enhance_and_rerank_responses(responses, query):
    """ Combine reranking and enhancing to select the most comprehensive and relevant response. """
    if not responses:
        return "No responses available."
    
    # Reranking using the semantic reranker
    query_bundle = QueryBundle(query)
    nodes = [EnhancedTextNode(TextNode(text=res)) for res in responses]  # Wrap TextNodes for compatibility
    reranked_nodes = reranker.postprocess_nodes(nodes=nodes, query_bundle=query_bundle)
    reranked_responses = [node.node.text for node in reranked_nodes]  # Adjust access to text
    # Pick the most representative answer: the one with the highest average
    # TF-IDF cosine similarity to the other reranked responses
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(reranked_responses)
    cosine_matrix = cosine_similarity(tfidf_matrix)
    avg_similarity = cosine_matrix.mean(axis=0)
    best_response_idx = avg_similarity.argmax()
    return reranked_responses[best_response_idx]
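# st.cache_resource keys the cache on the arguments, so pressing either button
# (flipping force_reindex or incremental_index to True) invalidates the cached
# index. Incremental indexing itself runs automatically whenever new files are
# detected; the incremental_index flag mainly serves as a cache-buster.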
@st.cache_resource(show_spinner=False)
def initialize(force_reindex=False, incremental_index=False):
    metadata_path = os.path.join(storage_path, 'metadata.json')
    new_files, removed_files = document_changes_detected(documents_path, metadata_path)
    if force_reindex:
        # If force_reindex is True, rebuild the index from scratch
        documents = SimpleDirectoryReader(input_dir=documents_path).load_data()
        nodes = parser.get_nodes_from_documents(documents)
        index = VectorStoreIndex(nodes, embed_model=Settings.embed_model)
        index.storage_context.persist(persist_dir=storage_path)
        return index, "Re-indexing completed."
    else:
        # Load the existing index from storage
        storage_context = StorageContext.from_defaults(persist_dir=storage_path)
        index = load_index_from_storage(storage_context)
        
        # Handle incremental indexing
        if new_files:
            new_file_paths = [os.path.join(documents_path, file) for file in new_files]
            new_documents = SimpleDirectoryReader(input_files=new_file_paths).load_data()
            new_nodes = parser.get_nodes_from_documents(new_documents)
            index.insert_nodes(new_nodes)
            incremental_message = "Incremental re-indexing completed."
        else:
            incremental_message = "No new documents detected. Incremental re-indexing not needed."
        # Handle removed files: look up each ref doc by file name and delete it from the index
        if removed_files:
            for ref_doc_id, info in list(index.ref_doc_info.items()):
                if info.metadata.get('file_name') in removed_files:
                    index.delete_ref_doc(ref_doc_id, delete_from_docstore=True)
        index.storage_context.persist(persist_dir=storage_path)
    return index, incremental_message
def main():
    st.title('Ask the Document')
    # Button to force re-indexing
    force_reindex = st.button("Re-index Documents")
    # Button to trigger incremental indexing
    incremental_index = st.button("Incremental Indexing")
    if force_reindex:
        st.info("Re-indexing triggered...")
    if incremental_index:
        st.info("Incremental indexing triggered...")
    if st.button('Clear Cache'):
        st.cache_data.clear()
        st.cache_resource.clear()  # Also clear the cached index from initialize()
        st.info('Cache cleared!')
    try:
        # Initialize or reinitialize index if needed
        index, message = initialize(force_reindex=force_reindex, incremental_index=incremental_index)
        st.info(f"Index initialized or loaded successfully.  {message}")
        # Check for documents and handle uploads
        if not os.listdir(documents_path):
            st.error("No documents found. Please upload your documents.")
            uploaded_files = st.file_uploader("Upload documents", accept_multiple_files=True, type=['pdf', 'txt', 'docx'])
            if uploaded_files:
                for uploaded_file in uploaded_files:
                    with open(os.path.join(documents_path, uploaded_file.name), "wb") as f:
                        f.write(uploaded_file.getvalue())
                st.rerun()  # Rerun the script after files are uploaded (st.experimental_rerun on older Streamlit)
        else:
            if 'messages' not in st.session_state:
                st.session_state.messages = [{'role': 'assistant', 'content': 'Ask me a question!'}]
            # Document interaction section
            chat_engine = index.as_chat_engine(chat_mode='condense_question', verbose=True)
            if prompt := st.text_input('Your question'):
                st.session_state.messages.append({'role': 'user', 'content': prompt})
            for message in st.session_state.messages:
                with st.expander(f"{message['role'].title()} says:"):
                    st.write(message['content'])
            if st.session_state.messages[-1]['role'] != 'assistant':
                with st.spinner('Thinking...'):
                    response = chat_engine.chat(prompt)
                    response_texts = response.response if isinstance(response.response, list) else [response.response]
                    st.write(response_texts)
                    best_response = enhance_and_rerank_responses(response_texts, prompt)
                    pprint_response(best_response, show_source=True)
                    st.session_state.messages.append({'role': 'assistant', 'content': best_response})
    except Exception as e:
        st.error("An error occurred during document processing or initialization.")
        st.text(f"Error: {e}")
        st.text(traceback.format_exc())  # To show full traceback in the interface
if __name__ == "__main__":
    main()
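
To run it locally, save the script (I'll assume app.py as the file name) and launch it with:

streamlit run app.py

You'll also need an Ollama server listening on http://localhost:11434 with nomic-embed-text pulled, and an OPENAI_API_KEY in your .env file for the GPT models.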

