-
Notifications
You must be signed in to change notification settings - Fork 48
/
Copy pathfunctions.py
106 lines (94 loc) · 3.01 KB
/
functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import streamlit as st
from zipfile import ZipFile
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
import base64
#------- OCR ------------
import pdf2image
import pytesseract
from pytesseract import Output, TesseractError
@st.cache_data
def images_to_txt(path, language):
    """Run OCR over a PDF and collect the recognised text image by image.

    Args:
        path: PDF content handed to ``pdf2image.convert_from_bytes``.
            NOTE(review): despite the name this is presumably raw bytes,
            not a filesystem path -- confirm against the caller.
        language: Value forwarded to Tesseract as its ``lang`` argument.

    Returns:
        A ``(texts, count)`` tuple: the list of OCR strings, one per image
        returned by pdf2image and in the same order, followed by the
        number of entries in that list.
    """
    page_images = pdf2image.convert_from_bytes(path)
    # One Tesseract call per rendered image, preserving order.
    recognised = [
        pytesseract.image_to_string(page_image, lang=language)
        for page_image in page_images
    ]
    return recognised, len(recognised)
@st.cache_data
def convert_pdf_to_txt_pages(path):
    """Extract the text of a PDF one page at a time using pdfminer.

    Args:
        path: Object passed directly to ``PDFPage.get_pages``.
            NOTE(review): presumably an open binary file-like object
            (e.g. an uploaded file) rather than a path string -- confirm
            against the caller.

    Returns:
        A ``(texts, nbPages)`` tuple: a list holding the extracted text of
        each page in document order, and the number of pages processed.
    """
    texts = []
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Single pass over the document: the page count falls out of the
        # number of collected texts, so the PDF is not parsed a second
        # time just to count its pages.
        for page in PDFPage.get_pages(path):
            interpreter.process_page(page)
            texts.append(retstr.getvalue())
            # Empty the shared buffer so the next page starts from scratch
            # instead of re-reading everything written so far.
            retstr.seek(0)
            retstr.truncate(0)
    finally:
        # Release the converter and buffer even if a page fails to parse.
        device.close()
        retstr.close()
    return texts, len(texts)
@st.cache_data
def convert_pdf_to_txt_file(path):
    """Extract the full text of a PDF as a single string using pdfminer.

    Args:
        path: Object passed directly to ``PDFPage.get_pages``.
            NOTE(review): presumably an open binary file-like object
            (e.g. an uploaded file) rather than a path string -- confirm
            against the caller.

    Returns:
        A ``(text, nbPages)`` tuple: the concatenated text of every page
        and the number of pages processed. A document that yields no
        pages returns ``("", 0)`` instead of raising.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
    nbPages = 0
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Count pages while processing them so the document is parsed once.
        for page in PDFPage.get_pages(path):
            interpreter.process_page(page)
            nbPages += 1
        # Read the buffer once, after the loop: this is always bound (even
        # for zero pages) and avoids copying the growing text per page.
        text = retstr.getvalue()
    finally:
        # Release the converter and buffer even if a page fails to parse.
        device.close()
        retstr.close()
    return text, nbPages
@st.cache_data
def save_pages(pages):
    """Write each page's text to its own file and bundle them into a zip.

    Files are written as ``./file_pages/page_<index>.txt`` (UTF-8, index
    starting at 0) and then added to ``./file_pages/pdf_to_txt.zip``, which
    is overwritten on every call.

    Args:
        pages: Sequence of strings, one per page.

    Returns:
        The path of the zip archive, ``'./file_pages/pdf_to_txt.zip'``.
    """
    import os  # local import: keeps this block self-contained

    # Create the output directory on first use instead of failing with
    # FileNotFoundError when it is missing.
    os.makedirs("./file_pages/", exist_ok=True)

    files = []
    for index, text in enumerate(pages):
        filename = "page_" + str(index) + ".txt"
        with open("./file_pages/" + filename, 'w', encoding="utf-8") as file:
            file.write(text)
        files.append(file.name)

    zipPath = './file_pages/pdf_to_txt.zip'
    # The context manager guarantees the archive is finalised and closed
    # even if adding one of the files raises.
    with ZipFile(zipPath, 'w') as zipObj:
        for f in files:
            zipObj.write(f)
    return zipPath
def displayPDF(file):
    """Embed a PDF in the Streamlit page as an inline ``<iframe>``.

    Args:
        file: Bytes-like PDF content; it is base64-encoded as-is and placed
            in a ``data:application/pdf`` URI, so no file is opened here.

    Returns:
        None. The HTML is emitted through ``st.markdown`` with
        ``unsafe_allow_html=True``.
    """
    # Base64 text is what gets interpolated into the data URI below.
    encoded_pdf = base64.b64encode(file).decode('utf-8')
    iframe_html = f'<iframe src="data:application/pdf;base64,{encoded_pdf}" width="700" height="1000" type="application/pdf"></iframe>'
    st.markdown(iframe_html, unsafe_allow_html=True)