init

2023-03-29 20:44:47 -04:00 · 2023-03-29 20:44:47 -04:00 · 3973e55da1
commit 3973e55da1
1 changed files with 109 additions and 0 deletions
--- a/convert.py
+++ b/convert.py
@ -0,0 +1,109 @@
 # Kevin Mathews 7/20/2020 rev 1.01
 # PDF to CBR Converter
 # written in Python 3
 # Script which converts PDF files to CBZ format.
 # Tested on both Linux and Windows
 # manual input
 # Change read_dir and write_dir to your preference.
 read_dir = r'./' # folder where your PDFs are located
 write_dir = r'./' # folder where completed cbz files should go
 # https://realpython.com/pdf-python/
 import os, sys
 from PyPDF2 import PdfFileReader
 from pdf2image import convert_from_path
 import tempfile
 import zipfile
 # function to handle image conversion
 def get_pdf_photos(input_path, newZip):
        def empty_folder(folder_loc):
                for the_file in os.listdir(folder_loc):
                        file_path = os.path.join(folder_loc, the_file)
                        try:
                                if os.path.isfile(file_path):
                                        os.unlink(file_path)
                        except Exception as e:
                                print(e)
                                pdb.set_trace()
        def extract_information(pdf_path):
                try:
                        with open(pdf_path, 'rb') as f:
                                pdf = PdfFileReader(f)
                                information = pdf.getDocumentInfo()
                                number_of_pages = pdf.getNumPages()
                                pageObj = pdf.getPage(0)
                                full_text = pageObj.extractText()
                        txt = f"""
                        Information about {pdf_path}:
                        Author: {information.author}
                        Creator: {information.creator}
                        Producer: {information.producer}
                        Subject: {information.subject}
                        Title: {information.title}
                        Number of pages: {number_of_pages}
                        """
                except:
                        information = '-'
                        full_text = '-'
                return information, full_text
        # use tempfile for image processing
        print('\tgetting images from path...')
        with tempfile.TemporaryDirectory() as path:
                print('\t' + path)
                # use convert_from_path to create list of images
                # https://pillow.readthedocs.io/en/stable/handbook/image-file-formats.html
                convert_from_path(pdf_path=input_path, fmt='jpeg', output_file = '', output_folder=path)
                print('\tgathered images.')
                for picture_name in os.listdir(path):
                        # save image to zip file
                        newZip.write(os.path.join(path, picture_name), arcname = picture_name, compress_type=zipfile.ZIP_DEFLATED)
 # function to create zip file and make cbz conversion more verbose
 def convert_pdf_to_comic(input_path, output_path):
        # interperet input path
        read_dir, file_name = os.path.split(input_path)
        cb_file_name = os.path.splitext(file_name)[0]
        # create zip file at working directory
        cb_file_path = os.path.join(output_path, cb_file_name + '.cbz')
        newZip = zipfile.ZipFile(cb_file_path, 'w')
        print('\t' + file_name)
        get_pdf_photos(input_path, newZip)
        # close zip file after completion
        newZip.close()
        print('\tsaved book', cb_file_name + '.cbz')
 if __name__ == "__main__":
        pdf_list = [i for i in os.listdir(read_dir) if i.endswith('.pdf') == True]
        no_of_pdfs = str(len(pdf_list))
        n = 1
        for read_file in pdf_list:
                input_path = os.path.join(read_dir, read_file)
                output_path = write_dir
                # initial checks
                assert os.path.exists(input_path) == True # check that file exists
                assert os.path.splitext(input_path)[1] == '.pdf' # check that file is pdf
                print('working on', n, 'of', no_of_pdfs)
                convert_pdf_to_comic(input_path, output_path)
        print('done.')