-
Notifications
You must be signed in to change notification settings - Fork 48
/
Copy pathapp.py
160 lines (145 loc) · 5.01 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import streamlit as st
import pdf2image
from PIL import Image
import pytesseract
from pytesseract import Output, TesseractError
from functions import convert_pdf_to_txt_pages, convert_pdf_to_txt_file, save_pages, displayPDF, images_to_txt
# Configure the page title.
st.set_page_config(page_title="PDF to Text")

# HTML template for a thin (1px padded) horizontal divider; the `{}`
# placeholder is filled with a CSS colour through str.format().
html_temp = (
    "\n"
    '<div style="background-color:{};padding:1px">\n'
    "</div>\n"
)

# Main-page heading.
st.markdown("\n## Text data extractor: PDF to Text\n")

# Display name -> language code handed to the OCR calls as `lang`.
languages = dict(
    English="eng",
    French="fra",
    Arabic="ara",
    Spanish="spa",
)
# Sidebar: conversion options followed by a short help text and credits.
# NOTE(review): indentation was missing in the reviewed copy; everything up
# to the "Buy Me A Coffee" link is assumed to live inside the sidebar -- confirm.
with st.sidebar:
    st.title(":outbox_tray: PDF to Text")

    # Output mode: one combined .txt, or one text file per page zipped up.
    textOutput = st.selectbox(
        "How do you want your output text?",
        ("One text file (.txt)", "Text file per page (ZIP)"),
    )
    # When ticked, the main flow runs OCR instead of reading the text layer.
    ocr_box = st.checkbox("Enable OCR (scanned document)")

    # The same divider is drawn twice, so format the template only once.
    sidebar_divider = html_temp.format("rgba(55, 53, 47, 0.16)")

    st.markdown(sidebar_divider, unsafe_allow_html=True)
    st.markdown(
        "\n"
        "# How does it work?\n"
        "Simply load your PDF and convert it to single-page or multi-page text.\n"
    )
    st.markdown(sidebar_divider, unsafe_allow_html=True)
    st.markdown(
        "\n"
        "Made by [@nainia_ayoub](https://twitter.com/nainia_ayoub)\n"
    )
    st.markdown(
        "\n"
        '<a href="https://www.buymeacoffee.com/nainiayoub" target="_blank">\n'
        '<img src="https://cdn.buymeacoffee.com/buttons/default-orange.png" alt="Buy Me A Coffee" height="41" width="174">\n'
        "</a>\n",
        unsafe_allow_html=True,
    )
# Upload widget; despite the label it also accepts PNG and JPG images,
# which the main flow sends straight to OCR.
pdf_file = st.file_uploader("Load your PDF", type=['pdf', 'png', 'jpg'])

# CSS injected into the page to hide the footer, the viewer badge and the
# main menu.
# NOTE(review): `.viewerBadge_container__1QSob` looks like a build-generated
# class name and may stop matching after a Streamlit upgrade -- verify.
# The block is terminated with a proper closing </style> tag; it previously
# ended with a second opening <style>, leaving the element unclosed.
hide = """
<style>
footer{
visibility: hidden;
position: relative;
}
.viewerBadge_container__1QSob{
visibility: hidden;
}
#MainMenu{
visibility: hidden;
}
</style>
"""
st.markdown(hide, unsafe_allow_html=True)
# "Spread the word!" tweet link plus the CSS for its button. Kept at module
# level so the literal stays unindented and byte-identical.
SHARE_BUTTON_HTML = '''
<a target="_blank" style="color: black" href="https://twitter.com/intent/tweet?text=You%20can%20extract%20text%20from%20your%20PDF%20using%20this%20PDF%20to%20Text%20streamlit%20app%20by%20@nainia_ayoub!%0A%0Ahttps://nainiayoub-pdf-text-data-extractor-app-p6hy0z.streamlit.app/">
<button class="btn">
Spread the word!
</button>
</a>
<style>
.btn{
display: inline-flex;
-moz-box-align: center;
align-items: center;
-moz-box-pack: center;
justify-content: center;
font-weight: 400;
padding: 0.25rem 0.75rem;
border-radius: 0.25rem;
margin: 0px;
line-height: 1.6;
color: rgb(49, 51, 63);
background-color: #fff;
width: auto;
user-select: none;
border: 1px solid rgba(49, 51, 63, 0.2);
}
.btn:hover{
color: #00acee;
background-color: #fff;
border: 1px solid #00acee;
}
</style>
'''

# Main flow: runs only once a file has been uploaded.
if pdf_file:
    # Raw bytes feed the document preview and the OCR helper; the text-layer
    # extractors and PIL receive the upload handle itself.
    path = pdf_file.read()
    # read() leaves the handle positioned at end-of-file. Rewind it so the
    # consumers below start from the first byte.
    # NOTE(review): assumes the helpers in `functions` do not rewind on their
    # own -- harmless if they do.
    pdf_file.seek(0)

    # Lower-cased so that e.g. "scan.PDF" is still routed to the PDF branch
    # instead of being opened as an image.
    file_extension = pdf_file.name.rsplit(".", 1)[-1].lower()

    if file_extension == "pdf":
        # Collapsible preview of the uploaded document.
        with st.expander("Display document"):
            displayPDF(path)

        # The language picker is only shown (and `option` only defined)
        # when OCR is enabled; it is only read in the OCR branches below.
        if ocr_box:
            option = st.selectbox('Select the document language', list(languages))

        if textOutput == 'One text file (.txt)':
            # Single-file output: OCR yields per-page texts that are joined
            # with blank lines; the non-OCR helper returns the text directly.
            if ocr_box:
                texts, nbPages = images_to_txt(path, languages[option])
                text_data_f = "\n\n".join(texts)
            else:
                text_data_f, nbPages = convert_pdf_to_txt_file(pdf_file)

            totalPages = f"Pages: {nbPages} in total"
            st.info(totalPages)
            st.download_button("Download txt file", text_data_f)
        else:
            # Per-page output: collect the page texts, then zip them.
            if ocr_box:
                text_data, nbPages = images_to_txt(path, languages[option])
            else:
                text_data, nbPages = convert_pdf_to_txt_pages(pdf_file)

            totalPages = f"Pages: {nbPages} in total"
            st.info(totalPages)

            # save_pages returns the path of the archive to offer for download.
            zipPath = save_pages(text_data)
            with open(zipPath, "rb") as fp:
                st.download_button(
                    label="Download ZIP (txt)",
                    data=fp,
                    file_name="pdf_to_txt.zip",
                    mime="application/zip",
                )
    else:
        # Any non-PDF upload is treated as an image and always goes through OCR.
        option = st.selectbox("What's the language of the text in the image?", list(languages))
        pil_image = Image.open(pdf_file)
        text = pytesseract.image_to_string(pil_image, lang=languages[option])

        # Image on the left, recognised text on the right.
        col1, col2 = st.columns(2)
        with col1:
            with st.expander("Display Image"):
                st.image(pdf_file)
        with col2:
            with st.expander("Display Text"):
                st.info(text)
        st.download_button("Download txt file", text)

    # NOTE(review): indentation was missing in the reviewed copy; the share
    # button is assumed to appear only after a file has been uploaded --
    # dedent this call if it should always be visible.
    st.markdown(SHARE_BUTTON_HTML, unsafe_allow_html=True)