pdf-to-cbz/convert.py

# Kevin Mathews 7/20/2020 rev 1.01
# PDF to CBZ Converter
# written in Python 3

# Script which converts PDF files to CBZ format.
# Tested on both Linux and Windows

# Manual configuration.
# Change read_dir and write_dir to your preference; both default to the
# current working directory ('./').
read_dir = r'./' # folder that is scanned for .pdf files
write_dir = r'./' # folder where the completed .cbz files are written

# https://realpython.com/pdf-python/
import os, sys
from PyPDF2 import PdfFileReader
from pdf2image import convert_from_path
import tempfile
import zipfile

# function to handle image conversion
def get_pdf_photos(input_path, newZip):
        """Render the pages of a PDF as JPEG files and add them to an open zip.

        The images are rendered into a temporary directory, copied into
        ``newZip`` and the temporary directory is removed again on exit.

        Args:
                input_path: path of the PDF file to convert.
                newZip: an open, writable ``zipfile.ZipFile``. This function
                        only adds entries; the caller is responsible for
                        closing the archive.

        Raises:
                Nothing is caught here: errors from ``convert_from_path`` or
                ``ZipFile.write`` propagate to the caller.
        """
        # use tempfile for image processing; the directory (and every image in
        # it) is deleted automatically when the with-block exits
        print('\tgetting images from path...')
        with tempfile.TemporaryDirectory() as path:
                print('\t' + path)

                # use convert_from_path to write the page images into `path`
                # https://pillow.readthedocs.io/en/stable/handbook/image-file-formats.html
                convert_from_path(pdf_path=input_path, fmt='jpeg', output_file='', output_folder=path)

                print('\tgathered images.')

                # os.listdir returns entries in arbitrary order; sort by name so
                # the archive layout is deterministic.
                # NOTE(review): assumes the generated file names sort in page
                # order (zero-padded counters) -- confirm for your pdf2image version.
                for picture_name in sorted(os.listdir(path)):
                        # save image to zip file, stored flat under its own name
                        newZip.write(
                                os.path.join(path, picture_name),
                                arcname=picture_name,
                                compress_type=zipfile.ZIP_DEFLATED,
                        )

# function to create zip file and make cbz conversion more verbose
def convert_pdf_to_comic(input_path, output_path):
        """Convert a single PDF into a ``.cbz`` archive.

        The archive is named after the PDF (extension replaced by ``.cbz``)
        and is created inside ``output_path``.

        Args:
                input_path: path of the PDF file to convert.
                output_path: existing directory in which the ``.cbz`` is created.

        Raises:
                Errors from opening the archive or from ``get_pdf_photos``
                propagate to the caller; the archive is closed either way.
        """
        # interpret input path: keep only the file name, then drop its extension
        file_name = os.path.basename(input_path)
        cb_file_name = os.path.splitext(file_name)[0]

        # create zip file in the output directory
        cb_file_path = os.path.join(output_path, cb_file_name + '.cbz')

        # the context manager closes the archive even if the conversion raises,
        # so the file handle is never leaked
        with zipfile.ZipFile(cb_file_path, 'w') as newZip:
                print('\t' + file_name)
                get_pdf_photos(input_path, newZip)

        print('\tsaved book', cb_file_name + '.cbz')

if __name__ == "__main__":

        # Collect the PDFs in read_dir. Matching on the split extension (rather
        # than endswith) also rejects a file literally named '.pdf', which has
        # no base name to build an archive name from. Sorted so the processing
        # order is deterministic.
        pdf_list = sorted(
                i for i in os.listdir(read_dir)
                if os.path.splitext(i)[1] == '.pdf'
        )
        no_of_pdfs = str(len(pdf_list))

        # enumerate from 1 so the progress message advances with every file
        for n, read_file in enumerate(pdf_list, start=1):

                input_path = os.path.join(read_dir, read_file)
                output_path = write_dir

                # initial check: the file must still exist. Raised explicitly
                # because assert statements are stripped when Python runs with -O.
                if not os.path.exists(input_path):
                        raise FileNotFoundError(input_path)

                print('working on', n, 'of', no_of_pdfs)
                convert_pdf_to_comic(input_path, output_path)

        print('done.')