About
This post summarizes how to register embeddings into a vector database (Redis, in this case) behind LlamaIndex, updated for the breaking changes in recent LlamaIndex releases.
Imports
from pathlib import Path
import uuid

from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.callbacks import CallbackManager, LlamaDebugHandler
from llama_index.core.indices.base import BaseIndex
from llama_index.core.indices.loading import load_index_from_storage
#from llama_index.core.indices.loading import load_indices_from_storage
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
from llama_index.core import VectorStoreIndex
from llama_index.core.storage.storage_context import StorageContext
from llama_index.storage.docstore.redis import RedisDocumentStore
from llama_index.storage.index_store.redis import RedisIndexStore
from llama_index.vector_stores.redis import RedisVectorStore
from llama_index.readers.file import PDFReader
from llama_index.readers.file import DocxReader
from llama_index.readers.file import PptxReader
from llama_index.readers.file import FlatReader
from llama_index.readers.file import MarkdownReader
from llama_index.core.vector_stores import MetadataFilters, ExactMatchFilter
Connecting to the Vector Database through LlamaIndex
def connect(redis_host, redis_port, namespace) -> StorageContext:
    # Index metadata (index structs, index IDs) lives in Redis.
    index_store = RedisIndexStore.from_host_and_port(
        host=redis_host,
        port=redis_port,
        namespace=namespace,
    )
    # Raw documents / nodes are stored in Redis as well.
    doc_store = RedisDocumentStore.from_host_and_port(
        host=redis_host,
        port=redis_port,
        namespace=namespace,
    )
    # Embeddings go into a Redis vector index named after the namespace.
    vec_store = RedisVectorStore(
        index_name=namespace,
        redis_url=f"redis://{redis_host}:{redis_port}",
    )
    return StorageContext.from_defaults(
        docstore=doc_store,
        index_store=index_store,
        vector_store=vec_store,
    )
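Wrapping all three stores in a single StorageContext lets LlamaIndex write node text, index metadata, and embeddings to the same Redis instance under one namespace, and the same connect() call can be reused later to load the index back for querying (see the sketch at the end of this post).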
Provisioning the Settings
def get_readers():
    # Map each supported file extension to the reader class that handles it.
    extensions = [
        ".pdf", ".docx", ".pptx", ".txt", ".md"
    ]
    readers = [
        PDFReader, DocxReader, PptxReader,
        FlatReader, MarkdownReader
    ]
    ext2reader = {
        ext: r
        for ext, r in zip(extensions, readers)
    }
    return ext2reader
def do_setting(
    embedding: str
):
    # OpenAI embedding models go through OpenAIEmbedding; anything else is
    # treated as a HuggingFace model name.
    if embedding in ['text-embedding-3-small', 'text-embedding-ada-002']:
        Settings.embed_model = OpenAIEmbedding(
            model=embedding
        )
    else:
        Settings.embed_model = HuggingFaceEmbedding(
            model_name=embedding
        )
    Settings.llm = OpenAI(model='gpt-4', temperature=0.3)
    Settings.node_parser = SentenceSplitter(chunk_size=512, chunk_overlap=20)
    llama_debug_handler = LlamaDebugHandler()
    Settings.callback_manager = CallbackManager([llama_debug_handler])
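Note that Settings is a process-wide singleton: the embedding model, LLM, node parser, and callback manager configured here apply to every subsequent LlamaIndex call. The same embedding model also has to be configured again at query time, otherwise the query vectors will not match the stored ones.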
Embedding
if __name__ == '__main__':
    import glob
    import argparse

    parser = argparse.ArgumentParser(
        description='Embed local files and register them in Redis through LlamaIndex.'
    )
    parser.add_argument(
        '--files_path', '-FP', type=str, default='./files',
        help='Directory that contains the files to embed.'
    )
    parser.add_argument(
        '--index_name', '-IN', type=str, default='myindex',
        help='Index ID to assign to the created index.'
    )
    parser.add_argument(
        '--embedding', '-EE', type=str, default='text-embedding-3-small',
        help='Embedding model name (OpenAI or HuggingFace).'
    )
    parser.add_argument(
        '--redis_host', '-RH', type=str, default='localhost',
        help='Redis host.'
    )
    parser.add_argument(
        '--redis_port', '-RP', type=int, default=6379,
        help='Redis port.'
    )
    parser.add_argument(
        '--namespace', '-NS', type=str, default='myspace',
        help='Namespace shared by the document, index and vector stores.'
    )
    args = parser.parse_args()

    do_setting(args.embedding)
    ext2reader = get_readers()

    # Read every supported file and attach a file_id / file_name to each document.
    documents = []
    path_list = glob.glob(f"{args.files_path}/*")
    for dpath in path_list:
        dp = Path(dpath)
        if dp.suffix not in ext2reader:
            continue
        reader = ext2reader[dp.suffix]
        documents.extend(
            reader().load_data(
                dp,
                extra_info=dict(
                    file_id=str(uuid.uuid4()),
                    file_name=dp.name,
                )
            )
        )

    # Split the documents into nodes with the configured SentenceSplitter.
    nodes = Settings.node_parser.get_nodes_from_documents(
        documents
    )
    print("documents", len(documents))
    print("nodes", len(nodes))

    # Building the index embeds the nodes and writes them to Redis.
    index = VectorStoreIndex(
        nodes,
        storage_context=connect(
            args.redis_host, args.redis_port, args.namespace
        ),
        store_nodes_override=True
    )
    index.set_index_id(args.index_name)
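Once the script has run, the registered index can be loaded back from the same Redis stores in a separate process and queried. The snippet below is a minimal sketch, not part of the script above: it assumes the default host, port, namespace, index ID, and embedding model from the arguments, OPENAI_API_KEY being set in the environment, and placeholder values for the query string and the file_name filter; whether filtering on the attached metadata works depends on the Redis vector index schema.
# Minimal sketch: load the registered index back from Redis and query it.
# Uses load_index_from_storage, MetadataFilters and ExactMatchFilter imported above.
do_setting('text-embedding-3-small')   # same global settings as at indexing time
storage_context = connect('localhost', 6379, 'myspace')
index = load_index_from_storage(storage_context, index_id='myindex')

# Optionally restrict retrieval to one file via the metadata attached at load time;
# filter support for this field depends on the vector store schema (assumption).
filters = MetadataFilters(filters=[ExactMatchFilter(key='file_name', value='example.pdf')])
query_engine = index.as_query_engine(similarity_top_k=3, filters=filters)
print(query_engine.query('What is this document about?'))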