def file2docs(
    file_path: str
) -> list[WikiDict]:
    """Load every JSON document from a JSON Lines file.

    The file is read as UTF-8 text and each non-blank line is parsed with
    ``json.loads``.

    Args:
        file_path: Path of the file to read.

    Returns:
        The parsed documents, in file order. The list is empty when the
        file contains no non-blank lines.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    result: list[WikiDict] = []
    with open(file_path, encoding="utf-8") as f:
        for line in f:
            # Skip separator lines. Testing the stripped text (instead of
            # comparing against a bare newline) also skips whitespace-only
            # lines, which would otherwise make json.loads raise.
            if not line.strip():
                continue
            result.append(json.loads(line))
    return result
# Gather the documents from every file found below the export directory.
documents: list[dict] = []
for dirs, subdirs, files in os.walk(f"{args.path_export}/"):
    # NOTE: despite its name, `dirs` is the path of the directory currently
    # being visited; `subdirs` and `files` are the entry names inside it.
    logger.info(f'{dirs}, {subdirs}, {files}')
    for file in files:
        # os.path.join avoids the doubled separator that plain string
        # concatenation produced for files directly under the walk root,
        # whose path already ends with "/".
        file_path = os.path.join(dirs, file)
        docs: list[dict] = file2docs(
            file_path=file_path
        )
        documents += docs

# Build one row per document, limited to the first 500 documents collected.
doclist = []
for doc in documents[0:500]:
    # 512 random floats drawn by np.random.rand; they are unrelated to the
    # document text (presumably a placeholder embedding -- TODO confirm).
    embedding = np.random.rand(512).tolist()
    # A document missing any of these keys raises KeyError here.
    doclist.append(
        [doc['id'], doc['title'], doc['url'], doc['text'], embedding]
    )

df = pd.DataFrame(
    doclist,
    columns=["id", "title", "url", "text", "embedding"],
)
# Written relative to the current working directory.
df.to_csv('./dataset.csv')