pdf-to-cbz/convert.py

110 lines
4.0 KiB
Python
Raw Normal View History

2023-03-29 20:44:47 -04:00
# Kevin Mathews 7/20/2020 rev 1.01
# PDF to CBZ Converter
# written in Python 3
# Script which converts PDF files to CBZ format.
# Tested on both Linux and Windows
# manual input
# Point read_dir and write_dir at the folders you want to use.
read_dir = "./"  # folder that is scanned for PDF files
write_dir = "./"  # folder that receives the finished cbz files
# https://realpython.com/pdf-python/
import os, sys
from PyPDF2 import PdfFileReader
from pdf2image import convert_from_path
import tempfile
import zipfile
# function to handle image conversion
def get_pdf_photos(input_path, newZip):
    """Render a PDF to JPEG files and add those files to an open zip archive.

    The images are written by ``convert_from_path`` into a temporary
    directory; every file found in that directory is then stored in the
    archive, and the directory is removed when the function returns.

    Args:
        input_path: Path of the PDF file to render.
        newZip: Archive object opened for writing (the caller passes a
            ``zipfile.ZipFile``). Each image is stored at the archive root
            under its generated file name. The archive is not closed here.
    """
    # use tempfile for image processing
    print('\tgetting images from path...')
    with tempfile.TemporaryDirectory() as path:
        print('\t' + path)
        # convert_from_path writes the JPEG files into `path`; its return
        # value is not needed because the files are picked up from disk below
        # https://pillow.readthedocs.io/en/stable/handbook/image-file-formats.html
        convert_from_path(pdf_path=input_path, fmt='jpeg', output_file='', output_folder=path)
        print('\tgathered images.')
        # os.listdir returns names in arbitrary order; sort them so the
        # archive entries are always written in the same, name-based order
        for picture_name in sorted(os.listdir(path)):
            # save image to zip file
            newZip.write(
                os.path.join(path, picture_name),
                arcname=picture_name,
                compress_type=zipfile.ZIP_DEFLATED,
            )
# function to create zip file and make cbz conversion more verbose
def convert_pdf_to_comic(input_path, output_path):
    """Convert a single PDF into a ``.cbz`` archive and report progress.

    The archive takes the PDF's file name with the extension replaced by
    ``.cbz`` and is filled by ``get_pdf_photos``.

    Args:
        input_path: Path of the PDF file to convert.
        output_path: Directory in which the ``.cbz`` file is created; the
            directory itself is not created here.
    """
    # interpret input path: only the file name part is needed
    file_name = os.path.basename(input_path)
    cb_file_name = os.path.splitext(file_name)[0]
    # create zip file in the output directory
    cb_file_path = os.path.join(output_path, cb_file_name + '.cbz')
    # the context manager closes the archive even if the conversion raises
    with zipfile.ZipFile(cb_file_path, 'w') as newZip:
        print('\t' + file_name)
        get_pdf_photos(input_path, newZip)
    print('\tsaved book', cb_file_name + '.cbz')
if __name__ == "__main__":
    # Convert every file in read_dir whose name ends in '.pdf', writing the
    # resulting cbz files to write_dir.
    pdf_list = [i for i in os.listdir(read_dir) if i.endswith('.pdf')]
    no_of_pdfs = str(len(pdf_list))
    # enumerate keeps the progress counter in step with the current file
    for n, read_file in enumerate(pdf_list, start=1):
        input_path = os.path.join(read_dir, read_file)
        output_path = write_dir
        # initial checks, raised explicitly so they also run under `python -O`
        if not os.path.exists(input_path):  # check that file exists
            raise FileNotFoundError(input_path)
        if os.path.splitext(input_path)[1] != '.pdf':  # check that file is pdf
            raise ValueError('not a pdf file: ' + input_path)
        print('working on', n, 'of', no_of_pdfs)
        convert_pdf_to_comic(input_path, output_path)
    print('done.')