[全量] 初始化项目代码、配置、文档及Agent协同harness
This commit is contained in:
26
langchain-chat/document_loaders/myimgloader.py
Normal file
26
langchain-chat/document_loaders/myimgloader.py
Normal file
@@ -0,0 +1,26 @@
|
||||
from typing import List
|
||||
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||
from document_loaders.ocr import get_ocr
|
||||
|
||||
|
||||
class RapidOCRLoader(UnstructuredFileLoader):
    """Document loader that turns an image file into text via ``get_ocr()``."""

    def _get_elements(self) -> List:
        """Run OCR on ``self.file_path`` and return the recognized text.

        Returns:
            A one-item list holding every recognized line joined by newlines,
            or an empty list when the OCR engine reports no result.
        """

        def img2text(filepath):
            """Return the OCR text of ``filepath``, or "" if nothing is found."""
            ocr = get_ocr()
            result, _ = ocr(filepath)
            if not result:
                return ""
            # Index 1 of each OCR entry is what gets used as the line's text.
            return "\n".join(line[1] for line in result)

        text = img2text(self.file_path)
        # Return a list, as annotated: a bare str would be walked character by
        # character by any caller that iterates over the elements.
        return [text] if text else []
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke run: load the sample image and print the resulting documents.
    print(RapidOCRLoader(file_path="../tests/samples/ocr_test.jpg").load())
|
||||
Reference in New Issue
Block a user