programming python Pdfplumber PyMuPDF PyPDF2 Read PDF Python

Hướng dẫn: How do I scrape text from a PDF in Python? - Làm cách nào để trích xuất văn bản từ PDF trong Python?

Tôi đang thêm mã để thực hiện điều này: nó đang hoạt động tốt cho tôi:

# This works in python 3
# required python packages
# tabula-py==1.0.0
# PyPDF2==1.26.0
# Pillow==4.0.0
# pdfminer.six==20170720

import os
import shutil
import warnings
from io import StringIO

import requests
import tabula
from PIL import Image
from PyPDF2 import PdfFileWriter, PdfFileReader
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage

# Silence all warnings for the rest of the script. This must be a call:
# subscripting the function raises TypeError at import time.
warnings.filterwarnings("ignore")


def download_file(url):
    """Download ``url`` into the current working directory.

    The local file name is the last ``/``-separated segment of ``url`` with
    every ``%20`` replaced by ``_``.

    Args:
        url: Address of the file to fetch.

    Returns:
        The name of the file that was written.
    """
    local_filename = url.split('/')[-1].replace("%20", "_")
    # stream=True so the body is copied to disk from the raw stream instead
    # of being read into memory by requests first.
    r = requests.get(url, stream=True)
    # Print the response object so the outcome of the request is visible.
    print(r)
    with open(local_filename, 'wb') as f:
        shutil.copyfileobj(r.raw, f)

    return local_filename


class PDFExtractor:
    """Helpers for splitting a PDF and extracting its text and tables.

    Text extraction is attempted with two libraries (PyPDF2 and pdfminer);
    ``extract_text`` returns whichever result is longer.
    """

    def __init__(self, url):
        """Store the URL this extractor is associated with."""
        self.url = url

    def break_pdf(self, filename, start_page=-1, end_page=-1):
        """Write each selected page of ``filename`` to its own PDF file.

        Each page ``n`` (1-based) is written next to the source file as
        ``"<n>_<basename of filename>"``.

        Args:
            filename: Path of the PDF to split.
            start_page: 1-based first page to export; ``-1`` means the
                first page of the document.
            end_page: 1-based last page to export (inclusive); ``-1`` means
                the last page of the document.

        Returns:
            ``None`` on success, or an error-message string when
            ``start_page`` / ``end_page`` is outside the document.
        """
        # Keep the source open for the whole loop: pages are written while
        # the reader still needs the underlying file.
        with open(filename, "rb") as source:
            pdf_reader = PdfFileReader(source)
            total_pages = pdf_reader.numPages

            if start_page == -1:
                first_index = 0
            elif start_page < 1 or start_page > total_pages:
                return "Start Page Selection Is Wrong"
            else:
                # Convert the 1-based page number to a 0-based index.
                first_index = start_page - 1

            if end_page == -1:
                stop_index = total_pages
            elif end_page < 1 or end_page > total_pages:
                # The last page (end_page == total_pages) is a valid choice.
                return "End Page Selection Is Wrong"
            else:
                # end_page is inclusive and 1-based, so it is already the
                # exclusive 0-based stop index for range().
                stop_index = end_page

            # Prefix only the base name so a filename that contains a
            # directory still yields a valid output path.
            directory, base_name = os.path.split(filename)

            for index in range(first_index, stop_index):
                output = PdfFileWriter()
                output.addPage(pdf_reader.getPage(index))
                target = os.path.join(directory, str(index + 1) + "_" + base_name)
                with open(target, "wb") as output_stream:
                    output.write(output_stream)

    def extract_text_algo_1(self, file):
        """Extract the text of the first page of ``file`` with PyPDF2.

        Only page index 0 is read. Newlines and tabs are removed from the
        result.

        Args:
            file: Path of the PDF to read.

        Returns:
            The extracted text with ``"\\n"`` and ``"\\t"`` stripped out.
        """
        with open(file, 'rb') as source:
            pdf_reader = PdfFileReader(source)
            # Extract while the file is still open.
            text = pdf_reader.getPage(0).extractText()

        return text.replace("\n", "").replace("\t", "")

    def extract_text_algo_2(self, file):
        """Extract text from ``file`` with pdfminer.

        Every page yielded by ``PDFPage.get_pages`` is processed and the
        converter output is collected in an in-memory buffer. Tabs and
        newlines are removed from the result.

        Args:
            file: Path of the PDF to read.

        Returns:
            The extracted text with ``"\\t"`` and ``"\\n"`` stripped out.
        """
        resource_manager = PDFResourceManager()
        buffer = StringIO()
        device = TextConverter(resource_manager, buffer, codec='utf-8', laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(resource_manager, device)
            # Empty page set and maxpages=0: no page restriction is passed
            # (presumably "all pages" in pdfminer -- confirm against its docs).
            with open(file, 'rb') as fp:
                for page in PDFPage.get_pages(fp, set(), maxpages=0, password="", caching=True,
                                              check_extractable=True):
                    interpreter.process_page(page)

            text = buffer.getvalue()
        finally:
            # Release the converter and buffer even if parsing fails.
            device.close()
            buffer.close()

        return text.replace("\t", "").replace("\n", "")

    def extract_text(self, file):
        """Return the longer of the two extraction results for ``file``.

        Args:
            file: Path of the PDF to read.

        Returns:
            The pdfminer result when it is strictly longer than the PyPDF2
            result, otherwise the PyPDF2 result.
        """
        text1 = self.extract_text_algo_1(file)
        text2 = self.extract_text_algo_2(file)

        if len(text2) > len(str(text1)):
            return text2
        return text1

    def extarct_table(self, file):
        """Read tables from ``file`` with tabula and print them.

        Nothing is returned; on a read failure a message is printed and the
        method returns early.

        Args:
            file: Path of the PDF to read.
        """
        # Read pdf into DataFrame
        try:
            df = tabula.read_pdf(file, output_format="csv")
        except Exception:
            # Best effort: report and give up, but do not swallow
            # KeyboardInterrupt / SystemExit as a bare except would.
            print("Error Reading Table")
            return

        print("\nPrinting Table Content: \n", df)
        print("\nDone Printing Table Content\n")

    # Correctly spelled alias; the original (misspelled) name is kept so
    # existing callers continue to work.
    extract_table = extarct_table

    def tiff_header_for_CCITT[self, width, height, img_size, CCITT_group=4]:
        tiff_header_struct = '


				
					

                 
	Bài Viết Liên Quan
	
	 	
		
		   
		   
		   
		
		
			Hướng dẫn code slider wordpress - thanh trượt mã wordpress

		
	

		
		
		   
		   
		   
		
		
			Hướng dẫn php image workshop - hội thảo hình ảnh php

		
	

		
		
		   
		   
		   
		
		
			Hướng dẫn crumb sh hello python - crumb sh xin chào trăn

		
	

		
		
		   
		   
		   
		
		
			Tỷ phú giàu nhất thế giới 2023

		
	

		
		
		   
		   
		   
		
		
			Hướng dẫn can you write to a text file in javascript? - bạn có thể ghi vào một tệp văn bản bằng javascript không?

		
	

		
		
		   
		   
		   
		
		
			Hướng dẫn combine two list of dictionaries python - kết hợp hai danh sách từ điển python

		
	

		
		
		   
		   
		   
		
		
			Hướng dẫn how do i center text in laravel in excel? - làm cách nào để căn giữa văn bản trong laravel trong excel?

		
	

		
		
		   
		   
		   
		
		
			Hướng dẫn what programming language is excel written in - excel được viết bằng ngôn ngữ lập trình nào

		
	

		
		
		   
		   
		   
		
		
			Hướng dẫn data types and expressions in python - kiểu dữ liệu và biểu thức trong python

		
	

		
		
		   
		   
		   
		
		
			Bói toán tử vi 2023

		
	

		
		
		   
		   
		   
		
		
			Hướng dẫn how do you make a copy in python? - làm thế nào để bạn tạo một bản sao trong python?

		
	

		
		
		   
		   
		   
		
		
			Hướng dẫn redirect stdout python - redirect stdout python

		
	

		
		
		   
		   
		   
		
		
			Hướng dẫn can we read csv file using javascript? - chúng ta có thể đọc tệp csv bằng javascript không?

		
	

		
		
		   
		   
		   
		
		
			So sánh bootloader windows và linux

		
	

		
		
		   
		   
		   
		
		
			Nữ 1995 năm 2024

		
	

		
		
		   
		   
		   
		
		
			Xem dung luong cac thu muc tren linux

		
	

		
		
		   
		   
		   
		
		
			Hướng dẫn if not empty php - nếu không có php trống

		
	

		
		
		   
		   
		   
		
		
			Hướng dẫn how do i search for text in a table in excel? - làm cách nào để tìm kiếm văn bản trong bảng trong excel?

		
	

		
		
		   
		   
		   
		
		
			Hướng dẫn php upload file check if exists - kiểm tra tệp tải lên php nếu tồn tại

		
	

		
		
		   
		   
		   
		
		
			Hướng dẫn multiselect dropdown bootstrap - bootstrap thả xuống multiselect

		
	

	
	




Toplist mới

 
	
	 
		#1
		
			Top 7 sự tích hồ gươm - ngữ văn lớp 6 2023
			5 tháng trước
		
	



	
	 
		#2
		
			Top 7 gdcd 6 bài 1 kết nối tri thức 2023
			5 tháng trước
		
	



	
	 
		#3
		
			Top 7 ý nghĩa của xây dựng gia đình văn hóa 2023
			5 tháng trước
		
	



	
	 
		#4
		
			Top 6 mẫu hợp đồng mượn đất làm nhà xưởng 2023
			5 tháng trước
		
	



	
	 
		#5
		
			Top 3 tổng tài biến thái tôi yêu anh tập 27 2023
			5 tháng trước
		
	



	
	 
		#6
		
			Top 6 kết thực phim mỹ nhân vô lệ 2023
			5 tháng trước
		
	



	
	 
		#7
		
			Top 9 trong những câu thơ sau câu nào sử dụng thành ngữ 2023
			5 tháng trước
		
	



	
	 
		#8
		
			Top 8 đề tài và chủ de của tác phẩm tắt đèn 2023
			5 tháng trước
		
	



	
	 
		#9
		
			Top 5 tiểu sử của thầy thích pháp hòa 2023
			5 tháng trước
		
	






		


	Bài mới nhất
	
	 	
		
		   
		   
		   
		
		
			Banner cỡ lớn treo ngoài đường tiếng anh là gì năm 2024

		
	

		
		
		   
		   
		   
		
		
			Top hãng mặt nạ nội địa trung quốc năm 2024

		
	

		
		
		   
		   
		   
		
		
			Giải hóa 8 bài nồng độ dung dịch năm 2024

		
	

		
		
		   
		   
		   
		
		
			Cường hóa lên thẳng 15 trong nháy mắt năm 2024

		
	

		
		
		   
		   
		   
		
		
			Phòng khám trung nguyện ở bình đại bến tre năm 2024

		
	

		
		
		   
		   
		   
		
		
			Cải lương chi bảo là gì năm 2024

		
	

		
		
		   
		   
		   
		
		
			Bài tập hỗn hợp kim loại tác dụng với hno3 năm 2024

		
	

	
	
                 
	Chủ Đề
	
	
	
		  programming
		  Hỏi Đáp
		  Toplist
		  Là gì
		  Bài Tập
		  Địa Điểm Hay
		  Mẹo Hay
		  Học Tốt
		  Nghĩa của từ
		  Công Nghệ
		  Khỏe Đẹp
		  bao nhiêu
		  Top List
		  Tiếng anh
		  Bao nhiêu
		  Sản phẩm tốt
		  Xây Đựng
		  Ngôn ngữ
		  javascript
		  Ở đâu
		  Đại học
		  Hướng dẫn
		  Bài tập
		  Tại sao
		  Dịch 
		  So Sánh
		  Máy tính
		  Món Ngon
		  Bao lâu
		  mẹo hay
		  Thế nào
		  So sánh
		  Khoa Học
		  Vì sao
		  Lớp 9
		  Lớp 10