pdf-to-cbz/convert.py

# Kevin Mathews 7/20/2020 rev 1.01
# PDF to CBZ Converter
# written in Python 3

# Script which converts PDF files to CBZ format.
# Tested on both Linux and Windows

# user settings
# Edit the two folders below to suit your setup.
write_dir = r'./' # destination folder for the finished cbz files
read_dir = r'./' # source folder that is scanned for PDFs

# https://realpython.com/pdf-python/
import os, sys
from PyPDF2 import PdfFileReader
from pdf2image import convert_from_path
import tempfile
import zipfile

# function to handle image conversion
def get_pdf_photos(input_path, newZip):
        """Render a PDF to JPEG images and add them to an open zip archive.

        Parameters:
                input_path: path of the PDF file to render.
                newZip: a zipfile.ZipFile opened for writing. Every rendered
                        image is written into it under its bare file name.
                        The archive is left open; closing it is the caller's job.

        Returns:
                None.
        """
        # use tempfile for image processing; the folder and everything in it
        # is removed automatically when the with-block exits
        print('\tgetting images from path...')
        with tempfile.TemporaryDirectory() as path:
                print('\t' + path)

                # use convert_from_path to render the PDF as jpeg files inside path
                # https://pillow.readthedocs.io/en/stable/handbook/image-file-formats.html
                convert_from_path(pdf_path=input_path, fmt='jpeg', output_file='', output_folder=path)

                print('\tgathered images.')

                # os.listdir returns names in arbitrary order, so sort them to
                # store the images in a stable, name-ordered sequence
                for picture_name in sorted(os.listdir(path)):
                        # save image to zip file
                        newZip.write(os.path.join(path, picture_name), arcname=picture_name, compress_type=zipfile.ZIP_DEFLATED)

# function to create zip file and make cbz conversion more verbose
def convert_pdf_to_comic(input_path, output_path):
        """Convert one PDF file into a .cbz archive named after it.

        Parameters:
                input_path: path of the PDF file to convert.
                output_path: folder in which the .cbz file is created.

        Returns:
                None.
        """
        # interpret input path: the archive takes the PDF's name minus its extension
        file_name = os.path.basename(input_path)
        cb_file_name = os.path.splitext(file_name)[0]

        # create zip file at working directory
        cb_file_path = os.path.join(output_path, cb_file_name + '.cbz')

        # the with-statement closes the archive on completion and also when
        # the image conversion raises, so the file handle is never leaked
        with zipfile.ZipFile(cb_file_path, 'w') as newZip:
                print('\t' + file_name)
                get_pdf_photos(input_path, newZip)

        print('\tsaved book', cb_file_name + '.cbz')

if __name__ == "__main__":

        # every entry of read_dir whose name ends in '.pdf' gets converted
        pdf_list = [i for i in os.listdir(read_dir) if i.endswith('.pdf')]
        no_of_pdfs = str(len(pdf_list))

        # enumerate from 1 so the progress message counts up with each file
        for n, read_file in enumerate(pdf_list, start=1):

                input_path = os.path.join(read_dir, read_file)
                output_path = write_dir

                # initial checks; raised explicitly because assert statements
                # are stripped when Python runs with -O
                if not os.path.exists(input_path): # check that file exists
                        raise FileNotFoundError(input_path)
                if os.path.splitext(input_path)[1] != '.pdf': # check that file is pdf
                        raise ValueError('not a pdf file: ' + input_path)

                print('working on', n, 'of', no_of_pdfs)
                convert_pdf_to_comic(input_path, output_path)

        print('done.')
init 2023-03-29 20:44:47 -04:00			`# Kevin Mathews 7/20/2020 rev 1.01`
			`# PDF to CBR Converter`
			`# written in Python 3`

			`# Script which converts PDF files to CBZ format.`
			`# Tested on both Linux and Windows`

			`# manual input`
			`# Change read_dir and write_dir to your preference.`
			`read_dir = r'./' # folder where your PDFs are located`
			`write_dir = r'./' # folder where completed cbz files should go`

			`# https://realpython.com/pdf-python/`
			`import os, sys`
			`from PyPDF2 import PdfFileReader`
			`from pdf2image import convert_from_path`
			`import tempfile`
			`import zipfile`

			`# function to handle image conversion`
			`def get_pdf_photos(input_path, newZip):`
			`def empty_folder(folder_loc):`
			`for the_file in os.listdir(folder_loc):`
			`file_path = os.path.join(folder_loc, the_file)`
			`try:`
			`if os.path.isfile(file_path):`
			`os.unlink(file_path)`
			`except Exception as e:`
			`print(e)`
			`pdb.set_trace()`

			`def extract_information(pdf_path):`
			`try:`
			`with open(pdf_path, 'rb') as f:`
			`pdf = PdfFileReader(f)`
			`information = pdf.getDocumentInfo()`
			`number_of_pages = pdf.getNumPages()`
			`pageObj = pdf.getPage(0)`
			`full_text = pageObj.extractText()`

			`txt = f"""`
			`Information about {pdf_path}:`

			`Author: {information.author}`
			`Creator: {information.creator}`
			`Producer: {information.producer}`
			`Subject: {information.subject}`
			`Title: {information.title}`
			`Number of pages: {number_of_pages}`
			`"""`
			`except:`
			`information = '-'`
			`full_text = '-'`

			`return information, full_text`

			`# use tempfile for image processing`
			`print('\tgetting images from path...')`
			`with tempfile.TemporaryDirectory() as path:`
			`print('\t' + path)`

			`# use convert_from_path to create list of images`
			`# https://pillow.readthedocs.io/en/stable/handbook/image-file-formats.html`
			`convert_from_path(pdf_path=input_path, fmt='jpeg', output_file = '', output_folder=path)`

			`print('\tgathered images.')`

			`for picture_name in os.listdir(path):`
			`# save image to zip file`
			`newZip.write(os.path.join(path, picture_name), arcname = picture_name, compress_type=zipfile.ZIP_DEFLATED)`

			`# function to create zip file and make cbz conversion more verbose`
			`def convert_pdf_to_comic(input_path, output_path):`

			`# interperet input path`
			`read_dir, file_name = os.path.split(input_path)`
			`cb_file_name = os.path.splitext(file_name)[0]`

			`# create zip file at working directory`
			`cb_file_path = os.path.join(output_path, cb_file_name + '.cbz')`
			`newZip = zipfile.ZipFile(cb_file_path, 'w')`

			`print('\t' + file_name)`
			`get_pdf_photos(input_path, newZip)`

			`# close zip file after completion`
			`newZip.close()`
			`print('\tsaved book', cb_file_name + '.cbz')`

			`if __name__ == "__main__":`

			`pdf_list = [i for i in os.listdir(read_dir) if i.endswith('.pdf') == True]`
			`no_of_pdfs = str(len(pdf_list))`

			`n = 1`
			`for read_file in pdf_list:`

			`input_path = os.path.join(read_dir, read_file)`
			`output_path = write_dir`

			`# initial checks`
			`assert os.path.exists(input_path) == True # check that file exists`
			`assert os.path.splitext(input_path)[1] == '.pdf' # check that file is pdf`

			`print('working on', n, 'of', no_of_pdfs)`
			`convert_pdf_to_comic(input_path, output_path)`

			`print('done.')`