# Kevin Mathews 7/20/2020 rev 1.01
# PDF to CBZ Converter
# written in Python 3
#
# Recovered from git patch 3973e55da1b785a1972cbd7197b4f0323ba6cf26
# ("[PATCH] init", basecamp88, Wed, 29 Mar 2023 20:44:47 -0400), which
# created convert.py.

# Script which converts PDF files to CBZ format.
# Tested on both Linux and Windows

# https://realpython.com/pdf-python/
import os
import sys
import tempfile
import zipfile

from PyPDF2 import PdfFileReader
from pdf2image import convert_from_path

# manual input
# Change read_dir and write_dir to your preference.
read_dir = r'./'  # folder where your PDFs are located
write_dir = r'./'  # folder where completed cbz files should go


def empty_folder(folder_loc):
    """Delete every regular file located directly inside folder_loc.

    Sub-directories are left untouched.  A file that cannot be removed is
    reported on stdout and the remaining files are still attempted.

    Not used by the conversion path below (the temporary directory cleans
    up after itself); kept as a standalone utility.
    """
    for the_file in os.listdir(folder_loc):
        file_path = os.path.join(folder_loc, the_file)
        try:
            if os.path.isfile(file_path):
                os.unlink(file_path)
        except OSError as e:
            # Best effort: report the failure and move on to the next file.
            print(e)


def extract_information(pdf_path):
    """Return (document_info, first_page_text) for the PDF at pdf_path.

    If the file cannot be opened or parsed, both elements are the
    placeholder string '-'.

    Not used by the conversion path below; kept as a standalone utility.
    NOTE(review): this relies on the legacy PyPDF2 names (PdfFileReader,
    getDocumentInfo, getPage, extractText) -- confirm they are supported by
    the installed PyPDF2 version.
    """
    try:
        with open(pdf_path, 'rb') as f:
            pdf = PdfFileReader(f)
            information = pdf.getDocumentInfo()
            full_text = pdf.getPage(0).extractText()
    except Exception:
        # Any read/parse problem degrades to placeholders instead of
        # aborting the caller.
        information = '-'
        full_text = '-'

    return information, full_text


# function to handle image conversion
def get_pdf_photos(input_path, newZip):
    """Render the PDF at input_path to JPEG files and add them to newZip.

    input_path -- path of the PDF to render.
    newZip     -- a zipfile.ZipFile already opened for writing.  It is left
                  open; closing it is the caller's responsibility.

    The images are written to a temporary directory that is removed when
    this function returns.
    """
    # use tempfile for image processing
    print('\tgetting images from path...')
    with tempfile.TemporaryDirectory() as path:
        print('\t' + path)

        # use convert_from_path to write one image file per page
        # https://pillow.readthedocs.io/en/stable/handbook/image-file-formats.html
        convert_from_path(
            pdf_path=input_path,
            fmt='jpeg',
            output_file='',
            output_folder=path,
        )

        print('\tgathered images.')

        # os.listdir() returns names in arbitrary order; sort them so the
        # archive layout is deterministic.  The generated names are presumed
        # to sort in page order -- TODO confirm against pdf2image's naming.
        for picture_name in sorted(os.listdir(path)):
            # save image to zip file
            newZip.write(
                os.path.join(path, picture_name),
                arcname=picture_name,
                compress_type=zipfile.ZIP_DEFLATED,
            )


# function to create zip file and make cbz conversion more verbose
def convert_pdf_to_comic(input_path, output_path):
    """Convert one PDF into '<pdf name without extension>.cbz'.

    input_path  -- path of the source PDF.
    output_path -- existing directory in which the .cbz file is created.

    Any exception raised while rendering or zipping is propagated; in that
    case the partially written .cbz file is removed first.
    """
    # interpret input path
    file_name = os.path.basename(input_path)
    cb_file_name = os.path.splitext(file_name)[0]

    # create zip file in the output directory
    cb_file_path = os.path.join(output_path, cb_file_name + '.cbz')

    try:
        # The context manager closes the archive even if conversion fails.
        with zipfile.ZipFile(cb_file_path, 'w') as newZip:
            print('\t' + file_name)
            get_pdf_photos(input_path, newZip)
    except BaseException:
        # Do not leave a truncated comic behind; then re-raise unchanged.
        if os.path.exists(cb_file_path):
            os.remove(cb_file_path)
        raise

    print('\tsaved book', cb_file_name + '.cbz')


def main():
    """Convert every PDF found in read_dir into a CBZ file in write_dir."""
    # Match the extension case-insensitively so 'BOOK.PDF' is picked up too;
    # sort for a reproducible processing order.
    pdf_list = sorted(
        name for name in os.listdir(read_dir)
        if name.lower().endswith('.pdf')
    )
    no_of_pdfs = len(pdf_list)

    # Make sure the destination exists before any archive is opened.
    os.makedirs(write_dir, exist_ok=True)

    for n, read_file in enumerate(pdf_list, start=1):
        input_path = os.path.join(read_dir, read_file)

        # initial check: the file may have vanished since it was listed.
        # (A real exception is used because asserts are stripped under -O.)
        if not os.path.exists(input_path):
            raise FileNotFoundError(input_path)

        print('working on', n, 'of', no_of_pdfs)
        convert_pdf_to_comic(input_path, write_dir)

    print('done.')


if __name__ == "__main__":
    main()