Hướng dẫn: How do I scrape text from a PDF in Python? - Làm cách nào để trích xuất văn bản từ PDF trong Python?

Tôi thêm đoạn mã dưới đây để thực hiện việc này; nó đang hoạt động tốt với tôi:

# This works in python 3
# required python packages
# tabula-py==1.0.0
# PyPDF2==1.26.0
# Pillow==4.0.0
# pdfminer.six==20170720

import os
import shutil
import warnings
from io import StringIO

import requests
import tabula
from PIL import Image
from PyPDF2 import PdfFileWriter, PdfFileReader
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage

# Suppress every warning for the rest of the process.
warnings.filterwarnings("ignore")


def download_file(url):
    """Download ``url`` to a local file and return that file's name.

    The local name is the last ``/``-separated segment of ``url`` with every
    ``%20`` replaced by ``_``; it is opened relative to the current working
    directory.  The response object is printed before the body is saved.

    No HTTP status check is made, so the body of an error response is
    written to the file as-is.
    """
    local_filename = url.split('/')[-1].replace("%20", "_")

    # The context manager releases the connection even if writing fails.
    with requests.get(url, stream=True) as response:
        print(response)
        with open(local_filename, 'wb') as f:
            # iter_content undoes gzip/deflate transfer encoding, which a raw
            # socket copy would have written to disk still compressed.
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)

    return local_filename


class PDFExtractor:
    """Helpers for splitting a PDF and extracting its text and tables."""

    def __init__(self, url):
        """Remember the ``url`` this extractor was created for."""
        self.url = url

    # Split a PDF into one single-page file per page
    def break_pdf(self, filename, start_page=-1, end_page=-1):
        """Write each page of ``filename`` in a range to its own PDF file.

        Pages are numbered from 1.  Each page is written next to the input as
        ``<page number>_<base name of filename>``.

        Args:
            filename: Path of the PDF to split.
            start_page: First page to write; ``-1`` means the first page.
            end_page: Last page to write (inclusive); ``-1`` means the last
                page.

        Returns:
            ``None`` after writing the pages, or an error message string when
            ``start_page`` or ``end_page`` is outside ``1..total pages``.
        """
        # Keep the input open for the whole loop: pages are read from it as
        # each output file is written.
        with open(filename, "rb") as source:
            pdf_reader = PdfFileReader(source)
            total_pages = pdf_reader.numPages

            if start_page == -1:
                start_index = 0
            elif start_page < 1 or start_page > total_pages:
                return "Start Page Selection Is Wrong"
            else:
                start_index = start_page - 1

            if end_page == -1:
                stop_index = total_pages
            elif end_page < 1 or end_page > total_pages:
                # The last page itself (end_page == total_pages) is valid.
                return "End Page Selection Is Wrong"
            else:
                stop_index = end_page

            # Prefix the page number to the base name only, so a filename
            # with a directory part still yields a valid output path.
            directory, base_name = os.path.split(filename)
            for i in range(start_index, stop_index):
                output = PdfFileWriter()
                output.addPage(pdf_reader.getPage(i))
                page_path = os.path.join(directory, "{}_{}".format(i + 1, base_name))
                with open(page_path, "wb") as output_stream:
                    output.write(output_stream)

    def extract_text_algo_1(self, file):
        """Return the text of the first page of ``file`` using PyPDF2.

        Only page index 0 is read.  Newlines and tabs are removed from the
        result.
        """
        # Close the file once the text has been pulled out of the page.
        with open(file, 'rb') as fp:
            pdf_reader = PdfFileReader(fp)
            first_page = pdf_reader.getPage(0)
            text = first_page.extractText()

        return text.replace("\n", "").replace("\t", "")

    def extract_text_algo_2(self, file):
        """Return the text of ``file`` using pdfminer.

        Every page yielded by ``PDFPage.get_pages`` is rendered into an
        in-memory buffer; tabs and newlines are then removed from the result.
        """
        resource_manager = PDFResourceManager()
        buffer = StringIO()
        device = TextConverter(resource_manager, buffer, codec='utf-8', laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(resource_manager, device)
            with open(file, 'rb') as fp:
                # NOTE(review): an empty page set with maxpages=0 is expected
                # to select all pages in pdfminer -- confirm against the
                # pinned version.
                pages = PDFPage.get_pages(fp, set(), maxpages=0, password="", caching=True,
                                          check_extractable=True)
                for page in pages:
                    interpreter.process_page(page)
            text = buffer.getvalue()
        finally:
            # Release the converter and buffer even when parsing fails.
            device.close()
            buffer.close()

        return text.replace("\t", "").replace("\n", "")

    def extract_text(self, file):
        """Return the longer of the two extraction results for ``file``.

        Runs ``extract_text_algo_1`` and ``extract_text_algo_2``.  The second
        result is returned only when it is strictly longer, so a tie goes to
        the first.
        """
        text1 = self.extract_text_algo_1(file)
        text2 = self.extract_text_algo_2(file)

        # str() keeps the length comparison valid if the first result is not
        # already a str.
        return text2 if len(text2) > len(str(text1)) else text1

    def extarct_table(self, file):
        """Read tables from ``file`` with tabula and print the result.

        Prints ``Error Reading Table`` and returns early when tabula fails.
        Always returns ``None``.
        """
        try:
            df = tabula.read_pdf(file, output_format="csv")
        except Exception:
            # Catch Exception rather than everything, so Ctrl-C and
            # SystemExit still propagate.
            print("Error Reading Table")
            return

        print("\nPrinting Table Content: \n", df)
        print("\nDone Printing Table Content\n")

    def tiff_header_for_CCITT[self, width, height, img_size, CCITT_group=4]:
        tiff_header_struct = '

Bài Viết Liên Quan

Chủ Đề