Hướng dẫn extracting text from pdf python - trích xuất văn bản từ pdf python

Tôi bổ sung đoạn mã dưới đây để thực hiện việc này; nó đang hoạt động tốt với tôi:

# This works in python 3
# required python packages
# tabula-py==1.0.0
# PyPDF2==1.26.0
# Pillow==4.0.0
# pdfminer.six==20170720

import os
import shutil
import warnings
from io import StringIO

import requests
import tabula
from PIL import Image
from PyPDF2 import PdfFileWriter, PdfFileReader
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage

# Silence library warnings for the whole script. filterwarnings is a function
# and must be called; subscripting it raises TypeError at import time.
warnings.filterwarnings("ignore")


def download_file(url):
    """Download ``url`` into the current working directory.

    The local file name is the last ``/``-separated segment of the URL with
    every ``%20`` replaced by an underscore.

    Args:
        url: Address of the file to download.

    Returns:
        The name of the local file that was written.

    Raises:
        requests.HTTPError: If the server answers with an error status, so
            that an error page is never saved in place of the document.
    """
    local_filename = url.split('/')[-1].replace("%20", "_")

    # Using the response as a context manager releases the connection even
    # when the copy below fails.
    with requests.get(url, stream=True) as r:
        print(r)
        r.raise_for_status()
        # r.raw bypasses requests' own decoding; ask urllib3 to undo any
        # gzip/deflate transfer encoding so the bytes on disk are the file.
        r.raw.decode_content = True
        with open(local_filename, 'wb') as f:
            shutil.copyfileobj(r.raw, f)

    return local_filename


class PDFExtractor[]:
    def __init__[self, url]:
        self.url = url

    # Splitting a local PDF file into single-page files
    def break_pdf(self, filename, start_page=-1, end_page=-1):
        """Split a PDF into one single-page PDF per selected page.

        Page ``n`` (1-based) is written beside the source file under the name
        ``<n>_<basename of filename>``.

        Args:
            filename: Path of the PDF to split.
            start_page: First page to export, 1-based; ``-1`` selects the
                first page of the document.
            end_page: Last page to export, 1-based and inclusive; ``-1``
                selects the last page of the document.

        Returns:
            ``None`` on success, or an error message string when one of the
            page selections is out of range.
        """
        # Prefixing the whole path would produce names like "1_dir/a.pdf";
        # only the base name gets the page-number prefix.
        directory, base_name = os.path.split(filename)

        # The reader pulls page data on demand, so the source must stay open
        # until every page has been written.
        with open(filename, "rb") as source:
            pdf_reader = PdfFileReader(source)
            total_pages = pdf_reader.numPages

            if start_page == -1:
                start_page = 0
            elif start_page < 1 or start_page > total_pages:
                return "Start Page Selection Is Wrong"
            else:
                # Convert the 1-based page number to a 0-based index.
                start_page = start_page - 1

            if end_page == -1:
                end_page = total_pages
            elif end_page < 1 or end_page > total_pages:
                # The last page (end_page == total_pages) is a valid choice:
                # the inclusive 1-based end equals the exclusive 0-based stop.
                return "End Page Selection Is Wrong"

            for i in range(start_page, end_page):
                output = PdfFileWriter()
                output.addPage(pdf_reader.getPage(i))
                target = os.path.join(directory, str(i + 1) + "_" + base_name)
                with open(target, "wb") as outputStream:
                    output.write(outputStream)

    def extract_text_algo_1(self, file):
        """Extract the text of the first page of a PDF with PyPDF2.

        Only page index 0 is read; later pages are ignored.

        Args:
            file: Path of the PDF to read.

        Returns:
            The page text with all newline and tab characters removed.
        """
        # Extract while the file is still open, then let the context manager
        # close the handle (the original left it open).
        with open(file, 'rb') as fp:
            pdf_reader = PdfFileReader(fp)
            text = pdf_reader.getPage(0).extractText()

        return text.replace("\n", "").replace("\t", "")

    def extract_text_algo_2(self, file):
        """Extract the text of a PDF with pdfminer's layout-aware converter.

        Args:
            file: Path of the PDF to read.

        Returns:
            The text emitted by the converter for the processed pages, with
            all tab and newline characters removed.
        """
        pdfResourceManager = PDFResourceManager()
        retstr = StringIO()
        device = TextConverter(pdfResourceManager, retstr, codec='utf-8', laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(pdfResourceManager, device)
            with open(file, 'rb') as fp:
                # An empty page-number set and maxpages=0 are passed as in the
                # original; presumably pdfminer reads both as "no restriction".
                for page in PDFPage.get_pages(fp, set(), maxpages=0, password="", caching=True,
                                              check_extractable=True):
                    interpreter.process_page(page)
            text = retstr.getvalue()
        finally:
            # Release the converter and buffer even when parsing fails.
            device.close()
            retstr.close()

        return text.replace("\t", "").replace("\n", "")

    def extract_text(self, file):
        """Run both extractors on ``file`` and return the longer result.

        Args:
            file: Path of the PDF to read.

        Returns:
            The output of ``extract_text_algo_2`` when it is strictly longer
            than that of ``extract_text_algo_1``; otherwise the latter.
        """
        text1 = self.extract_text_algo_1(file)
        text2 = self.extract_text_algo_2(file)

        # str() keeps the length comparison valid even if the first extractor
        # hands back something that is not a plain string.
        if len(text2) > len(str(text1)):
            return text2
        return text1

    def extarct_table(self, file):
        """Read the tables of a PDF with tabula and print them.

        Args:
            file: Path of the PDF to read.

        Returns:
            ``None``. When tabula fails, an error line is printed instead of
            the table content.
        """
        # Read pdf into DataFrame
        try:
            df = tabula.read_pdf(file, output_format="csv")
        except Exception:
            # Best-effort: report and carry on, but let KeyboardInterrupt and
            # SystemExit propagate (a bare ``except:`` would swallow them).
            print("Error Reading Table")
            return

        print("\nPrinting Table Content: \n", df)
        print("\nDone Printing Table Content\n")

    def tiff_header_for_CCITT[self, width, height, img_size, CCITT_group=4]:
        tiff_header_struct = '


				
					

                 
	Bài Viết Liên Quan
	
	 	
		
		   
		   
		   
		
		
			Hướng dẫn what is a destructor in php? - hàm hủy trong php là gì?

		
	

		
		
		   
		   
		   
		
		
			Các bản linux tốt nhất

		
	

		
		
		   
		   
		   
		
		
			Lương nhân viên ngân hàng agribank 2023

		
	

		
		
		   
		   
		   
		
		
			Hướng dẫn can we run php file without local server? - chúng ta có thể chạy tệp php mà không có máy chủ cục bộ không?

		
	

		
		
		   
		   
		   
		
		
			Hướng dẫn what is * kwargs in python? - * kwargs trong python là gì?

		
	

		
		
		   
		   
		   
		
		
			Hướng dẫn html enter - html nhập

		
	

		
		
		   
		   
		   
		
		
			Hướng dẫn how do i reindex a series in python? - làm cách nào để lập lại chỉ mục một chuỗi trong python?

		
	

		
		
		   
		   
		   
		
		
			Mở rộng phân vùng ở cứng trên linux

		
	

		
		
		   
		   
		   
		
		
			Tử vi canh tuất năm 2023

		
	

		
		
		   
		   
		   
		
		
			Hướng dẫn how do you concatenate a variable and url in python? - làm thế nào để bạn nối một biến và url trong python?

		
	

		
		
		   
		   
		   
		
		
			Giá sh 2023 hôm nay

		
	

		
		
		   
		   
		   
		
		
			Hướng dẫn data analysis trong excel macbook - phân tích dữ liệu trong excel macbook

		
	

		
		
		   
		   
		   
		
		
			Hướng dẫn javascript string contains character regex - chuỗi javascript chứa ký tự regex

		
	

		
		
		   
		   
		   
		
		
			Hướng dẫn foreach loop in php example - vòng lặp foreach trong ví dụ php

		
	

		
		
		   
		   
		   
		
		
			Trường tôn đức thắng tuyển sinh 2023

		
	

		
		
		   
		   
		   
		
		
			Hướng dẫn how to create a table in javascript - cách tạo bảng trong javascript

		
	

		
		
		   
		   
		   
		
		
			Hướng dẫn 36 pi to php - 36 ngày và php

		
	

		
		
		   
		   
		   
		
		
			Hướng dẫn text to html converter - công cụ chuyển đổi văn bản sang html

		
	

		
		
		   
		   
		   
		
		
			Hướng dẫn how do i save a webpage as html in python? - làm cách nào để lưu một trang web dưới dạng html trong python?

		
	

		
		
		   
		   
		   
		
		
			Tử vi tuổi mậu tuất nam mạng sinh năm 2023

		
	

	
	




Toplist mới

 
	
	 
		#1
		
			Top 7 sự tích hồ gươm - ngữ văn lớp 6 2023
			7 tháng trước
		
	



	
	 
		#2
		
			Top 7 gdcd 6 bài 1 kết nối tri thức 2023
			7 tháng trước
		
	



	
	 
		#3
		
			Top 7 ý nghĩa của xây dựng gia đình văn hóa 2023
			7 tháng trước
		
	



	
	 
		#4
		
			Top 6 mẫu hợp đồng mượn đất làm nhà xưởng 2023
			7 tháng trước
		
	



	
	 
		#5
		
			Top 3 tổng tài biến thái tôi yêu anh tập 27 2023
			7 tháng trước
		
	



	
	 
		#6
		
			Top 6 kết thực phim mỹ nhân vô lệ 2023
			7 tháng trước
		
	



	
	 
		#7
		
			Top 9 trong những câu thơ sau câu nào sử dụng thành ngữ 2023
			7 tháng trước
		
	



	
	 
		#8
		
			Top 8 đề tài và chủ de của tác phẩm tắt đèn 2023
			7 tháng trước
		
	



	
	 
		#9
		
			Top 5 tiểu sử của thầy thích pháp hòa 2023
			7 tháng trước
		
	






		


	Bài mới nhất
	
	 	
		
		   
		   
		   
		
		
			B.n.n bị tố đạo văn 80 năm 2024

		
	

		
		
		   
		   
		   
		
		
			Cách dạy con học toán lớp 1 hiệu quả năm 2024

		
	

		
		
		   
		   
		   
		
		
			Làm thế nào để hết chuột rút bắp chân năm 2024

		
	

		
		
		   
		   
		   
		
		
			Bán đất đường trần quang diệu thành phố thanh hóa năm 2024

		
	

	
	
                 
	Chủ Đề
	
	
	
		  programming
		  Hỏi Đáp
		  Toplist
		  Là gì
		  Bài Tập
		  Địa Điểm Hay
		  Mẹo Hay
		  Học Tốt
		  Nghĩa của từ
		  Công Nghệ
		  Khỏe Đẹp
		  bao nhiêu
		  Top List
		  Tiếng anh
		  Bao nhiêu
		  Sản phẩm tốt
		  Xây Đựng
		  Ngôn ngữ
		  javascript
		  Ở đâu
		  Đại học
		  Hướng dẫn
		  Bài tập
		  Tại sao
		  Dịch 
		  So Sánh
		  Máy tính
		  Món Ngon
		  mẹo hay
		  Bao lâu
		  Thế nào
		  So sánh
		  Khoa Học
		  Vì sao
		  Lớp 9
		  Lớp 10