# October 19, 2024
def file2docs(file_path: str) -> list[WikiDict]:
    """Load a JSON Lines file into a list of parsed records.

    Every line that is not blank is parsed as one JSON value; blank and
    whitespace-only lines are ignored.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        The parsed lines, in file order. Empty if the file has no
        non-blank lines.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, encoding="utf-8") as f:
        # Test the stripped line rather than comparing against "\n":
        # json.loads rejects whitespace-only input, so such lines must be
        # skipped too, not only completely empty ones.
        return [json.loads(line) for line in f if line.strip()]


# Collect the parsed records of every file found under the export directory
# (searched recursively) into one flat list.
documents: list[dict] = []
for dirpath, dirnames, filenames in os.walk(f"{args.path_export}/"):
    logger.info(f'{dirpath}, {dirnames}, {filenames}')

    for filename in filenames:
        # os.path.join avoids the doubled separator that plain string
        # concatenation yields for the top-level directory, whose walk
        # path already ends in "/".
        file_path = os.path.join(dirpath, filename)
        documents.extend(file2docs(file_path=file_path))

# Build one row per document for the first 500 documents: the four text
# fields followed by a vector of 512 uniform random floats (presumably a
# placeholder for a real embedding -- confirm).
field_names = ("id", "title", "url", "text")
doclist = []
for doc in documents[:500]:
    # Draw the random vector before reading the fields, one draw per document.
    embedding = np.random.rand(512).tolist()
    row = [doc[name] for name in field_names]
    row.append(embedding)
    doclist.append(row)


# Tabulate the rows and write them to a CSV file in the working directory.
df = pd.DataFrame(doclist, columns=[*field_names, "embedding"])
df.to_csv('./dataset.csv')