init
This commit is contained in:
commit
3973e55da1
109
convert.py
Normal file
109
convert.py
Normal file
@ -0,0 +1,109 @@
|
||||
# Kevin Mathews 7/20/2020 rev 1.01
|
||||
# PDF to CBZ Converter
|
||||
# written in Python 3
|
||||
|
||||
# Script which converts PDF files to CBZ format.
|
||||
# Tested on both Linux and Windows
|
||||
|
||||
# --- manual configuration ---
# Edit the two folders below before running the script.

# folder that is scanned for the PDF files to convert
read_dir = "./"

# folder that receives the finished .cbz files
write_dir = "./"
|
||||
|
||||
# https://realpython.com/pdf-python/
|
||||
import os, sys
|
||||
from PyPDF2 import PdfFileReader
|
||||
from pdf2image import convert_from_path
|
||||
import tempfile
|
||||
import zipfile
|
||||
|
||||
# function to handle image conversion
|
||||
def get_pdf_photos(input_path, newZip):
    """Render the pages of a PDF as JPEG files and add them to an open zip.

    The images are produced by ``convert_from_path`` inside a temporary
    directory; that directory is deleted automatically when the ``with``
    block exits, after the files have been copied into the archive.

    Args:
        input_path: Path of the PDF file to convert.
        newZip: An object with a ``zipfile.ZipFile``-style ``write`` method,
            opened for writing. One entry is added per file found in the
            temporary directory. The caller remains responsible for
            closing it.
    """
    # use tempfile for image processing
    print('\tgetting images from path...')
    with tempfile.TemporaryDirectory() as path:
        print('\t' + path)

        # use convert_from_path to write the images into the temp folder
        # https://pillow.readthedocs.io/en/stable/handbook/image-file-formats.html
        convert_from_path(pdf_path=input_path, fmt='jpeg', output_file='', output_folder=path)

        print('\tgathered images.')

        # os.listdir() returns names in arbitrary order; sort them so the
        # archive is written deterministically (presumably in page order,
        # assuming the generated file names are numbered -- TODO confirm).
        for picture_name in sorted(os.listdir(path)):
            # save image to zip file under its bare file name
            newZip.write(
                os.path.join(path, picture_name),
                arcname=picture_name,
                compress_type=zipfile.ZIP_DEFLATED,
            )
|
||||
|
||||
# function to create zip file and make cbz conversion more verbose
|
||||
def convert_pdf_to_comic(input_path, output_path):
    """Convert one PDF file into a ``.cbz`` archive and report progress.

    The archive is named after the PDF (extension replaced by ``.cbz``)
    and is created inside ``output_path``.

    Args:
        input_path: Path of the PDF file to convert.
        output_path: Directory in which the ``.cbz`` file is written.
    """
    # interpret input path: only the file name is needed to name the archive
    file_name = os.path.basename(input_path)
    cb_file_name = os.path.splitext(file_name)[0]

    # create zip file in the output directory
    cb_file_path = os.path.join(output_path, cb_file_name + '.cbz')

    # the context manager closes the archive even when the conversion raises
    with zipfile.ZipFile(cb_file_path, 'w') as newZip:
        print('\t' + file_name)
        get_pdf_photos(input_path, newZip)

    print('\tsaved book', cb_file_name + '.cbz')
|
||||
|
||||
if __name__ == "__main__":

    # every entry in read_dir whose name ends in '.pdf' (case-sensitive)
    pdf_list = [i for i in os.listdir(read_dir) if i.endswith('.pdf')]
    no_of_pdfs = str(len(pdf_list))

    # enumerate from 1 so the progress message counts up with each file
    for n, read_file in enumerate(pdf_list, start=1):

        input_path = os.path.join(read_dir, read_file)
        output_path = write_dir

        # initial checks -- explicit raises instead of assert, because
        # assert statements are skipped when Python runs with -O
        if not os.path.exists(input_path):  # check that file exists
            raise FileNotFoundError(input_path)
        if os.path.splitext(input_path)[1] != '.pdf':  # check that file is pdf
            raise ValueError('not a .pdf file: ' + input_path)

        print('working on', n, 'of', no_of_pdfs)
        convert_pdf_to_comic(input_path, output_path)

    print('done.')
|
||||
|
Loading…
Reference in New Issue
Block a user