Building an AI-Driven Document Query Assistant: New Update Supports GPT-4o and Incremental Re-Indexing

Please refer to the earlier posts in this series to get up to speed on this project.

This is a continuation of the “Building an AI-Driven Document Query Assistant” series. In the last update, I implemented a non-OpenAI embedding model, nomic-embed-text:latest, served through Ollama. In this update, I have added an incremental re-indexing feature and enabled users to choose between multiple GPT language models: the script now supports gpt-3.5-turbo and gpt-4o.
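
Under the hood, the incremental path boils down to a set difference: compare the files currently in the documents folder against a small metadata.json of files that were already indexed, embed only the new ones, and insert their nodes into the existing index. Here is a condensed sketch of that idea (metadata_path, documents_path, index, and parser are the same names used in the complete script below):

# Sketch: detect new files and insert only their nodes into the existing index
if os.path.exists(metadata_path):
    with open(metadata_path) as f:
        indexed = set(json.load(f))
else:
    indexed = set()
current = {name for name in os.listdir(documents_path)
           if os.path.isfile(os.path.join(documents_path, name))}
new_files = current - indexed
if new_files:
    paths = [os.path.join(documents_path, name) for name in new_files]
    new_docs = SimpleDirectoryReader(input_files=paths).load_data()
    index.insert_nodes(parser.get_nodes_from_documents(new_docs))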

You can find the public GitHub repo for this project at: https://github.com/aarriitt666/ragai_llamaindex

Complete code:

import streamlit as st
import os
from dotenv import load_dotenv
from llama_index.llms.openai import OpenAI
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, load_index_from_storage, StorageContext
from llama_index.core.settings import Settings
import warnings
from llama_index.core.indices.postprocessor import SentenceTransformerRerank
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import TextNode
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import json
import traceback
from llama_index.embeddings.ollama import OllamaEmbedding
# Load environment variables
load_dotenv('.env')
# Suppress specific FutureWarnings from huggingface_hub
warnings.filterwarnings("ignore", category=FutureWarning, module='huggingface_hub')
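# Minimal stand-in for llama_index's QueryBundle: the reranker only reads
# the .query_str attribute, so this duck-typed class suffices here.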
class QueryBundle:
    def __init__(self, query_str):
        self.query_str = query_str
# Define paths
storage_path = './vectorstore'
documents_path = './documents'
# Model selection
model_option = st.selectbox(
    "Select LLM model",
    ("gpt-3.5-turbo", "gpt-4o")
    )
# Set the model configuration
Settings.llm = OpenAI(model=model_option, temperature=0.1)
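# chunk_size/chunk_overlap are measured in tokens: 2048-token chunks with a
# 500-token overlap trade fewer, larger nodes for more context per node.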
Settings.chunk_size = 2048
Settings.chunk_overlap = 500
ollama_embedding = OllamaEmbedding(
    model_name="nomic-embed-text:latest",
    base_url="http://localhost:11434",
    ollama_additional_kwargs={"mirostat": 0},
)
Settings.embed_model = ollama_embedding
# Ensure directories exist
os.makedirs(storage_path, exist_ok=True)
os.makedirs(documents_path, exist_ok=True)
# Initialize the reranker
reranker = SentenceTransformerRerank(model="cross-encoder/ms-marco-MiniLM-L-2-v2", top_n=5)
# Initialize the parser
parser = SentenceSplitter()
def document_changes_detected(documents_path, metadata_path):
    # Load existing metadata if available
    if os.path.exists(metadata_path):
        with open(metadata_path, 'r') as f:
            indexed_files = set(json.load(f))
    else:
        indexed_files = set()
    # Get the current set of documents
    current_files = {file for file in os.listdir(documents_path) if os.path.isfile(os.path.join(documents_path, file))}
    # Detect changes
    new_files = current_files - indexed_files
    removed_files = indexed_files - current_files
    # Uncomment the lines below to debug file-change detection
    # st.write(f"Current files: {current_files}")
    # st.write(f"Indexed files: {indexed_files}")
    # st.write(f"New files: {new_files}")
    # st.write(f"Removed files: {removed_files}")
    # Update metadata file if changes are detected
    if new_files or removed_files:
        with open(metadata_path, 'w') as f:
            json.dump(list(current_files), f)
    return list(new_files), list(removed_files)
def pprint_response(response, show_source=False):
    # Local helper: handles plain strings as well as Response objects
    if isinstance(response, str):
        print(response)  # Handle the string directly
    else:
        if response.response is None:
            print("No response.")
        else:
            print(response.response)
            if show_source:
                print("Source:", response.source_nodes)
                
class EnhancedTextNode:
    """Duck-types NodeWithScore: exposes .node and accepts a .score attribute,
    which is all SentenceTransformerRerank needs."""
    def __init__(self, text_node):
        self.node = text_node  # Wrap the original TextNode
    def get_content(self, metadata_mode):
        return self.node.text  # Text accessor in case the reranker calls it on the wrapper
# enhance_and_rerank_responses wraps each response in an EnhancedTextNode so the reranker can score it
def enhance_and_rerank_responses(responses, query):
    """ Combine reranking and enhancing to select the most comprehensive and relevant response. """
    if not responses:
        return "No responses available."
    
    # Reranking using the semantic reranker
    query_bundle = QueryBundle(query)
    nodes = [EnhancedTextNode(TextNode(text=res)) for res in responses]  # Wrap TextNodes for compatibility
    reranked_nodes = reranker.postprocess_nodes(nodes=nodes, query_bundle=query_bundle)
    reranked_responses = [node.node.text for node in reranked_nodes]  # Adjust access to text
    # Pick the most representative answer: the one with the highest average
    # TF-IDF cosine similarity to the other reranked responses
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(reranked_responses)
    cosine_matrix = cosine_similarity(tfidf_matrix)
    avg_similarity = cosine_matrix.mean(axis=0)
    best_response_idx = avg_similarity.argmax()
    return reranked_responses[best_response_idx]
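# st.cache_resource keys the cache on the arguments, so pressing either button
# (flipping force_reindex or incremental_index to True) invalidates the cached
# index. Incremental indexing itself runs automatically whenever new files are
# detected; the incremental_index flag mainly serves as a cache-buster.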
@st.cache_resource(show_spinner=False)
def initialize(force_reindex=False, incremental_index=False):
    metadata_path = os.path.join(storage_path, 'metadata.json')
    new_files, removed_files = document_changes_detected(documents_path, metadata_path)
    if force_reindex:
        # If force_reindex is True, rebuild the index from scratch
        documents = SimpleDirectoryReader(input_dir=documents_path).load_data()
        nodes = parser.get_nodes_from_documents(documents)
        index = VectorStoreIndex(nodes, embed_model=Settings.embed_model)
        index.storage_context.persist(persist_dir=storage_path)
        return index, "Re-indexing completed."
    else:
        # Load the existing index from storage
        storage_context = StorageContext.from_defaults(persist_dir=storage_path)
        index = load_index_from_storage(storage_context)
        
        # Handle incremental indexing
        if new_files:
            new_file_paths = [os.path.join(documents_path, file) for file in new_files]
            new_documents = SimpleDirectoryReader(input_files=new_file_paths).load_data()
            new_nodes = parser.get_nodes_from_documents(new_documents)
            index.insert_nodes(new_nodes)
            incremental_message = "Incremental re-indexing completed."
        else:
            incremental_message = "No new documents detected. Incremental re-indexing not needed."
        # Handle removed files: look up each ref doc by file name and delete it from the index
        if removed_files:
            for ref_doc_id, info in list(index.ref_doc_info.items()):
                if info.metadata.get('file_name') in removed_files:
                    index.delete_ref_doc(ref_doc_id, delete_from_docstore=True)
        index.storage_context.persist(persist_dir=storage_path)
    return index, incremental_message
def main():
    st.title('Ask the Document')
    # Button to force re-indexing
    force_reindex = st.button("Re-index Documents")
    # Button to trigger incremental indexing
    incremental_index = st.button("Incremental Indexing")
    if force_reindex:
        st.info("Re-indexing triggered...")
    if incremental_index:
        st.info("Incremental indexing triggered...")
    if st.button('Clear Cache'):
        st.cache_data.clear()
        st.cache_resource.clear()  # Also clear the cached index from initialize()
        st.info('Cache cleared!')
    try:
        # Initialize or reinitialize index if needed
        index, message = initialize(force_reindex=force_reindex, incremental_index=incremental_index)
        st.info(f"Index initialized or loaded successfully.  {message}")
        # Check for documents and handle uploads
        if not os.listdir(documents_path):
            st.error("No documents found. Please upload your documents.")
            uploaded_files = st.file_uploader("Upload documents", accept_multiple_files=True, type=['pdf', 'txt', 'docx'])
            if uploaded_files:
                for uploaded_file in uploaded_files:
                    with open(os.path.join(documents_path, uploaded_file.name), "wb") as f:
                        f.write(uploaded_file.getvalue())
                st.rerun()  # Rerun the script after files are uploaded (st.experimental_rerun on older Streamlit)
        else:
            if 'messages' not in st.session_state:
                st.session_state.messages = [{'role': 'assistant', 'content': 'Ask me a question!'}]
            # Document interaction section
            chat_engine = index.as_chat_engine(chat_mode='condense_question', verbose=True)
            if prompt := st.text_input('Your question'):
                st.session_state.messages.append({'role': 'user', 'content': prompt})
            for message in st.session_state.messages:
                with st.expander(f"{message['role'].title()} says:"):
                    st.write(message['content'])
            if st.session_state.messages[-1]['role'] != 'assistant':
                with st.spinner('Thinking...'):
                    response = chat_engine.chat(prompt)
                    response_texts = response.response if isinstance(response.response, list) else [response.response]
                    st.write(response_texts)
                    best_response = enhance_and_rerank_responses(response_texts, prompt)
                    pprint_response(best_response, show_source=True)
                    st.session_state.messages.append({'role': 'assistant', 'content': best_response})
    except Exception as e:
        st.error("An error occurred during document processing or initialization.")
        st.text(f"Error: {e}")
        st.text(traceback.format_exc())  # To show full traceback in the interface
if __name__ == "__main__":
    main()
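
To run it locally, save the script (I'll assume app.py as the file name) and launch it with:

streamlit run app.py

You'll also need an Ollama server listening on http://localhost:11434 with nomic-embed-text pulled, and an OPENAI_API_KEY in your .env file for the GPT models.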

