110 lines
4.0 KiB
Python
110 lines
4.0 KiB
Python
# Kevin Mathews 7/20/2020 rev 1.01
|
|
# PDF to CBZ Converter
|
|
# written in Python 3
|
|
|
|
# Script which converts PDF files to CBZ format.
|
|
# Tested on both Linux and Windows
|
|
|
|
# manual input
|
|
# Change read_dir and write_dir to your preference.
|
|
# Input folder: the script lists this folder (non-recursively) and converts
# every entry whose name ends in '.pdf'.
read_dir = r'./' # folder where your PDFs are located
|
|
# Output folder: each converted book is written here as '<pdf base name>.cbz'.
write_dir = r'./' # folder where completed cbz files should go
|
|
|
|
# https://realpython.com/pdf-python/
|
|
import os, sys
|
|
from PyPDF2 import PdfFileReader
|
|
from pdf2image import convert_from_path
|
|
import tempfile
|
|
import zipfile
|
|
|
|
# function to handle image conversion
|
|
def get_pdf_photos(input_path, newZip):
    """Render the pages of a PDF to JPEG files and store them in a zip archive.

    Args:
        input_path: Path of the PDF file to render.
        newZip: An open, writable ``zipfile.ZipFile``. Every rendered image is
            added at the archive root under its own file name. The archive is
            left open; closing it is the caller's responsibility.
    """
    print('\tgetting images from path...')

    # Render into a throwaway directory. It is deleted when the ``with`` block
    # exits, so the images must be copied into the archive before leaving it.
    with tempfile.TemporaryDirectory() as path:
        print('\t' + path)

        # convert_from_path writes the rendered images into ``path``; the
        # returned value is not needed because the files are read from disk.
        # https://pillow.readthedocs.io/en/stable/handbook/image-file-formats.html
        convert_from_path(pdf_path=input_path, fmt='jpeg', output_file='', output_folder=path)

        print('\tgathered images.')

        # os.listdir() returns names in arbitrary order; sort so the archive
        # entries are written in a stable, name-ordered sequence.
        for picture_name in sorted(os.listdir(path)):
            # save image to zip file
            newZip.write(
                os.path.join(path, picture_name),
                arcname=picture_name,
                compress_type=zipfile.ZIP_DEFLATED,
            )
|
|
|
|
# function to create zip file and make cbz conversion more verbose
|
|
def convert_pdf_to_comic(input_path, output_path):
    """Convert a single PDF into a ``.cbz`` archive inside *output_path*.

    Progress messages are printed to stdout.

    Args:
        input_path: Path of the source PDF file.
        output_path: Folder in which ``<pdf base name>.cbz`` is created. The
            archive is opened in ``'w'`` mode, so an existing file with the
            same name is replaced.
    """
    # interpret input path
    file_name = os.path.basename(input_path)
    cb_file_name = os.path.splitext(file_name)[0]

    # create zip file in the output folder
    cb_file_path = os.path.join(output_path, cb_file_name + '.cbz')

    # The context manager closes the archive even when the conversion raises,
    # so the file handle is never leaked.
    with zipfile.ZipFile(cb_file_path, 'w') as newZip:
        print('\t' + file_name)
        get_pdf_photos(input_path, newZip)

    print('\tsaved book', cb_file_name + '.cbz')
|
|
|
|
if __name__ == "__main__":

    # Collect the PDFs located directly in read_dir (sub-folders are not searched).
    pdf_list = [name for name in os.listdir(read_dir) if name.endswith('.pdf')]
    no_of_pdfs = str(len(pdf_list))

    # enumerate() supplies the 1-based progress counter shown to the user.
    for n, read_file in enumerate(pdf_list, start=1):

        input_path = os.path.join(read_dir, read_file)
        output_path = write_dir

        # initial checks -- explicit raises rather than assert, so they are
        # still enforced when Python runs with -O
        if not os.path.exists(input_path):  # check that file exists
            raise FileNotFoundError(input_path)
        if os.path.splitext(input_path)[1] != '.pdf':  # check that file is pdf
            raise ValueError('not a .pdf file: ' + input_path)

        print('working on', n, 'of', no_of_pdfs)
        convert_pdf_to_comic(input_path, output_path)

    print('done.')
|
|
|