init

2023-03-29 20:44:47 -04:00 · 2023-03-29 20:44:47 -04:00 · 3973e55da1
commit 3973e55da1
1 changed files with 109 additions and 0 deletions
--- a/convert.py
+++ b/convert.py
@ -0,0 +1,109 @@
+# Kevin Mathews 7/20/2020 rev 1.01
+# PDF to CBZ Converter
+# written in Python 3
+
+# Script which converts PDF files to CBZ format.
+# Tested on both Linux and Windows
+
+# Manual configuration: edit the two paths below before running the script.
+# Both default to the current working directory.
+read_dir = r'./' # folder scanned for *.pdf files (not recursive)
+write_dir = r'./' # folder where completed cbz files should go
+
+# https://realpython.com/pdf-python/
+import os, sys
+from PyPDF2 import PdfFileReader
+from pdf2image import convert_from_path
+import tempfile
+import zipfile
+
+# function to handle image conversion
+def get_pdf_photos(input_path, newZip):
+        """Render the pages of a PDF to JPEG files and add them to a zip archive.
+
+        input_path -- path of the PDF file to convert
+        newZip     -- open, writable zipfile.ZipFile that receives the images
+
+        The images are rendered into a temporary directory that is removed
+        on return; the caller remains responsible for closing newZip.
+        """
+        # NOTE: the two helpers below are not called anywhere in this function.
+        def empty_folder(folder_loc):
+                """Delete the regular files located directly inside folder_loc."""
+                for the_file in os.listdir(folder_loc):
+                        file_path = os.path.join(folder_loc, the_file)
+                        try:
+                                if os.path.isfile(file_path):
+                                        os.unlink(file_path)
+                        except OSError as e:
+                                # best effort: report the failure and keep going
+                                print(e)
+
+        def extract_information(pdf_path):
+                """Return (document info, first-page text), or ('-', '-') on error."""
+                try:
+                        with open(pdf_path, 'rb') as f:
+                                pdf = PdfFileReader(f)
+                                information = pdf.getDocumentInfo()
+                                full_text = pdf.getPage(0).extractText()
+                except Exception:
+                        # unreadable or malformed PDF: fall back to placeholders
+                        information = '-'
+                        full_text = '-'
+
+                return information, full_text
+
+        # use tempfile for image processing
+        print('\tgetting images from path...')
+        with tempfile.TemporaryDirectory() as path:
+                print('\t' + path)
+
+                # use convert_from_path to create list of images
+                # https://pillow.readthedocs.io/en/stable/handbook/image-file-formats.html
+                convert_from_path(pdf_path=input_path, fmt='jpeg', output_file = '', output_folder=path)
+
+                print('\tgathered images.')
+
+                # os.listdir order is arbitrary; sort so pages are stored in name order
+                for picture_name in sorted(os.listdir(path)):
+                        # save image to zip file
+                        newZip.write(os.path.join(path, picture_name), arcname = picture_name, compress_type=zipfile.ZIP_DEFLATED)
+
+# function to create zip file and make cbz conversion more verbose
+def convert_pdf_to_comic(input_path, output_path):
+        """Convert the PDF at input_path into a .cbz archive inside output_path."""
+
+        # interpret input path
+        file_name = os.path.basename(input_path)
+        cb_file_name = os.path.splitext(file_name)[0]
+
+        # create zip file in the output directory
+        cb_file_path = os.path.join(output_path, cb_file_name + '.cbz')
+
+        # the with block closes the archive even if the conversion fails
+        with zipfile.ZipFile(cb_file_path, 'w') as newZip:
+                print('\t' + file_name)
+                get_pdf_photos(input_path, newZip)
+
+        print('\tsaved book', cb_file_name + '.cbz')
+
+if __name__ == "__main__":
+
+        pdf_list = [i for i in os.listdir(read_dir) if i.endswith('.pdf')]
+        no_of_pdfs = str(len(pdf_list))
+
+        # enumerate from 1 so the progress message advances with every file
+        for n, read_file in enumerate(pdf_list, start=1):
+
+                input_path = os.path.join(read_dir, read_file)
+                output_path = write_dir
+
+                # initial checks
+                assert os.path.exists(input_path) # check that file exists
+                assert os.path.splitext(input_path)[1] == '.pdf' # check that file is pdf
+
+                print('working on', n, 'of', no_of_pdfs)
+                convert_pdf_to_comic(input_path, output_path)
+
+        print('done.')
+