Tôi bổ sung đoạn mã dưới đây để thực hiện việc này; nó hoạt động tốt với tôi:
# This works in python 3
# required python packages
# tabula-py==1.0.0
# PyPDF2==1.26.0
# Pillow==4.0.0
# pdfminer.six==20170720
import os
import shutil
import warnings
from io import StringIO
import requests
import tabula
from PIL import Image
from PyPDF2 import PdfFileWriter, PdfFileReader
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
# Install a blanket "ignore" filter: with no category or module given this
# silences every warning for the rest of the process.
warnings.filterwarnings("ignore")
def download_file(url):
    """Download *url* into the current working directory.

    The local file name is the last ``/``-separated segment of the URL with
    every ``%20`` replaced by ``_``.

    Args:
        url: Address of the file to fetch.

    Returns:
        The name of the local file that was written.
    """
    local_filename = url.split('/')[-1].replace("%20", "_")
    # The response is used as a context manager so the streamed connection
    # is released even if writing the file fails.
    with requests.get(url, stream=True) as r:
        print(r)
        # ``r.raw`` bypasses requests' content decoding; without this flag a
        # gzip/deflate-encoded body would be written to disk still compressed.
        r.raw.decode_content = True
        with open(local_filename, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
    return local_filename
class PDFExtractor:
    """Utilities for splitting PDF files and extracting text and tables."""

    def __init__(self, url):
        """Remember the URL of the PDF this extractor works on.

        Args:
            url: Source URL of the document; it is only stored here.
        """
        self.url = url
# Downloading File in local
def break_pdf(self, filename, start_page=-1, end_page=-1):
    """Split a PDF into one single-page PDF file per selected page.

    Page ``n`` (1-based) is written next to the source file as
    ``"<n>_<basename of filename>"``.

    Args:
        filename: Path of the PDF to split.
        start_page: 1-based first page to export; ``-1`` selects the first
            page of the document.
        end_page: 1-based last page to export (inclusive); ``-1`` selects
            the last page of the document.

    Returns:
        ``None`` on success. An out-of-range selection is reported by
        returning an error message string instead of raising, which is the
        contract existing callers rely on.
    """
    # Keep the source open for the whole split: the reader pulls page data
    # from the stream while pages are being copied.
    with open(filename, "rb") as source:
        pdf_reader = PdfFileReader(source)
        total_pages = pdf_reader.numPages

        if start_page == -1:
            first_index = 0
        elif start_page < 1 or start_page > total_pages:
            return "Start Page Selection Is Wrong"
        else:
            # Convert the 1-based page number to a 0-based index.
            first_index = start_page - 1

        if end_page == -1:
            stop_index = total_pages
        elif end_page < 1 or end_page > total_pages:
            # The last page itself (end_page == total_pages) is a valid
            # selection; only values beyond it are rejected.
            return "End Page Selection Is Wrong"
        else:
            # An inclusive 1-based end equals an exclusive 0-based stop.
            stop_index = end_page

        # Prefix only the base name so a path such as "dir/a.pdf" becomes
        # "dir/1_a.pdf" rather than the non-existent "1_dir/a.pdf".
        directory, base_name = os.path.split(filename)
        for i in range(first_index, stop_index):
            output = PdfFileWriter()
            output.addPage(pdf_reader.getPage(i))
            page_path = os.path.join(directory, str(i + 1) + "_" + base_name)
            with open(page_path, "wb") as outputStream:
                output.write(outputStream)
def extract_text_algo_1(self, file):
    """Extract the text of the first page of a PDF with PyPDF2.

    Args:
        file: Path of the PDF to read.

    Returns:
        The text of page 0 with all newline and tab characters removed.
    """
    # The reader works off the open stream, so extraction has to finish
    # before the file is closed.
    with open(file, 'rb') as pdf_file:
        pdf_reader = PdfFileReader(pdf_file)
        # Only the first page is read.
        page_obj = pdf_reader.getPage(0)
        text = page_obj.extractText()
    return text.replace("\n", "").replace("\t", "")
def extract_text_algo_2(self, file):
    """Extract the text of a PDF with pdfminer.

    Args:
        file: Path of the PDF to read.

    Returns:
        The text collected from the processed pages with all tab and
        newline characters removed.
    """
    resource_manager = PDFResourceManager()
    la_params = LAParams()
    password = ""
    # NOTE(review): 0 and an empty set are pdfminer's "no page limit" and
    # "no page filter" values for get_pages - confirm against the pinned
    # pdfminer.six version.
    max_pages = 0
    caching = True
    page_num = set()

    # The context managers and the ``finally`` release the buffer, the file
    # and the converter even when a page fails to process.
    with StringIO() as retstr, open(file, 'rb') as fp:
        device = TextConverter(resource_manager, retstr, codec='utf-8', laparams=la_params)
        try:
            interpreter = PDFPageInterpreter(resource_manager, device)
            for page in PDFPage.get_pages(fp, page_num, maxpages=max_pages, password=password,
                                          caching=caching, check_extractable=True):
                interpreter.process_page(page)
            # Read the buffer before it is closed on leaving the ``with``.
            text = retstr.getvalue()
        finally:
            device.close()
    return text.replace("\t", "").replace("\n", "")
def extract_text(self, file):
    """Run both text extractors on *file* and return the longer result.

    Args:
        file: Path of the PDF to read.

    Returns:
        The output of ``extract_text_algo_2`` when it is strictly longer
        than the string form of the ``extract_text_algo_1`` output;
        otherwise the ``extract_text_algo_1`` output (ties included).
    """
    text1 = self.extract_text_algo_1(file)
    text2 = self.extract_text_algo_2(file)
    # Compare against str(text1) so a non-string result from the first
    # extractor still has a length.
    return text2 if len(text2) > len(str(text1)) else text1
def extarct_table(self, file):
    """Read the tables of a PDF with tabula and print them.

    The misspelled method name is kept because it is the public interface.

    Args:
        file: Path of the PDF to read.

    Returns:
        ``None``. On a read failure a short error message is printed and
        the method returns early.
    """
    # Read pdf into DataFrame
    try:
        df = tabula.read_pdf(file, output_format="csv")
    except Exception:
        # Catch ``Exception`` rather than everything so Ctrl-C and
        # interpreter shutdown are not swallowed.
        print("Error Reading Table")
        return
    print("\nPrinting Table Content: \n", df)
    print("\nDone Printing Table Content\n")
def tiff_header_for_CCITT[self, width, height, img_size, CCITT_group=4]:
tiff_header_struct = '