#!/usr/bin/env python3
"""Extract the cover image of an EPUB and save it as a PNG thumbnail.

Usage: <script> INPUT_EPUB SIZE

Three strategies are tried in order: the OPF manifest, the OPF guide, and
finally a guess based on the file names inside the archive.
"""

# pillow is a dependency
import os
import posixpath
import re
import sys
import zipfile
from io import BytesIO
from urllib.parse import unquote
from xml.dom import minidom

try:
    from PIL import Image
except ImportError:
    # Deferred failure: the cover-discovery helpers only need the standard
    # library, so they stay importable; extract_cover() reports the missing
    # dependency as soon as it is called.
    Image = None

img_ext_regex = re.compile(r'^.*\.(jpg|jpeg|png)$', flags=re.IGNORECASE)
cover_regex = re.compile(r'.*cover.*\.(jpg|jpeg|png)', flags=re.IGNORECASE)

# Root folder under which thumbnails are written when run as a script.
DEFAULT_OUTPUT_ROOT = '/tmp/epub/'


def _resolve_href(base_dir, href):
    """Turn an href found in an EPUB document into a zip member name.

    The fragment (``#...``) is dropped, percent-escapes are decoded and the
    result is joined to *base_dir* and normalised. Zip member names always
    use forward slashes, hence ``posixpath`` rather than ``os.path``.
    """
    path = unquote(href.split("#", 1)[0])
    return posixpath.normpath(posixpath.join(base_dir, path))


def _first_image_src(dom):
    """Return the first non-empty image reference in *dom*, or None.

    Looks at HTML ``<img src>`` first, then at SVG ``<image>`` elements
    (``xlink:href`` or plain ``href``).
    """
    for img in dom.getElementsByTagName("img"):
        src = img.getAttribute("src")
        if src:
            return src
    for image in dom.getElementsByTagName("image"):
        src = image.getAttribute("xlink:href") or image.getAttribute("href")
        if src:
            return src
    return None


def get_cover_from_manifest(epub):
    """Find the cover through the OPF ``<meta name="cover">`` / manifest.

    Args:
        epub: an open ``zipfile.ZipFile`` of the EPUB.

    Returns:
        The zip member name of the cover candidate, or None if no manifest
        item looks like a cover.

    Raises:
        IndexError: if the package document has no ``<manifest>`` element.
    """
    rootfile_path, rootfile_root = _get_rootfile_root(epub)
    base_dir = posixpath.dirname(rootfile_path)

    # find possible cover in meta
    cover_id = None
    for meta in rootfile_root.getElementsByTagName("meta"):
        if meta.getAttribute("name") == "cover":
            # An empty content attribute must not count as an id: "" would
            # compare equal to the (missing) properties of every item.
            cover_id = meta.getAttribute("content") or None
            break

    # find the manifest element
    manifest = rootfile_root.getElementsByTagName("manifest")[0]
    for item in manifest.getElementsByTagName("item"):
        item_id = item.getAttribute("id")
        item_properties = item.getAttribute("properties")
        item_href = item.getAttribute("href")
        item_href_is_image = img_ext_regex.match(item_href) is not None

        id_might_be_cover = item_id == cover_id or (
            'cover' in item_id and item_href_is_image)
        properties_might_be_cover = item_properties == cover_id or (
            'cover' in item_properties and item_href_is_image)
        if id_might_be_cover or properties_might_be_cover:
            return _resolve_href(base_dir, item_href)

    return None


def get_cover_by_guide(epub):
    """Find the cover through the OPF ``<reference type="cover">`` entry.

    The referenced target is normally an (X)HTML page; the first image it
    embeds is returned. If the reference points straight at an image file,
    that file is returned.

    Args:
        epub: an open ``zipfile.ZipFile`` of the EPUB.

    Returns:
        The zip member name of the cover image, or None.
    """
    rootfile_path, rootfile_root = _get_rootfile_root(epub)
    base_dir = posixpath.dirname(rootfile_path)

    for ref in rootfile_root.getElementsByTagName("reference"):
        if ref.getAttribute("type") != "cover":
            continue
        cover_file_path = _resolve_href(base_dir, ref.getAttribute("href"))
        if img_ext_regex.match(cover_file_path):
            return cover_file_path

        # is html
        with epub.open(cover_file_path) as cover_file:
            cover_dom = minidom.parseString(cover_file.read())
        img_path = _first_image_src(cover_dom)
        if img_path:
            # Image paths are relative to the page that embeds them.
            return _resolve_href(posixpath.dirname(cover_file_path), img_path)

    return None


def get_cover_by_filename(epub):
    """Guess the cover from the member names of the archive.

    The first image whose name contains "cover" wins; otherwise the largest
    image in the archive is used.

    Args:
        epub: an open ``zipfile.ZipFile`` of the EPUB.

    Returns:
        The zip member name of the chosen image, or None if the archive
        contains no jpg/jpeg/png file.
    """
    no_matching_images = []
    for fileinfo in epub.filelist:
        if cover_regex.match(fileinfo.filename):
            return fileinfo.filename
        if img_ext_regex.match(fileinfo.filename):
            no_matching_images.append(fileinfo)

    best = _choose_best_image(no_matching_images)
    # Return a name in both branches (the helper yields a ZipInfo).
    return best.filename if best is not None else None


def _choose_best_image(images):
    """Return the ``ZipInfo`` with the largest uncompressed size, or None."""
    if images:
        return max(images, key=lambda f: f.file_size)
    return None


def _get_rootfile_root(epub):
    """Locate and parse the package document (OPF) of the EPUB.

    Returns:
        A ``(rootfile_path, document)`` tuple: the zip member name of the
        package document as declared in ``META-INF/container.xml`` and its
        parsed ``minidom`` document.

    Raises:
        KeyError: if ``META-INF/container.xml`` or the rootfile is missing.
        IndexError: if the container declares no ``<rootfile>``.
    """
    # open the main container
    with epub.open("META-INF/container.xml") as container:
        container_root = minidom.parseString(container.read())

    # locate the rootfile
    elem = container_root.getElementsByTagName("rootfile")[0]
    rootfile_path = elem.getAttribute("full-path")

    # open the rootfile
    with epub.open(rootfile_path) as rootfile:
        return rootfile_path, minidom.parseString(rootfile.read())


def extract_cover(epub, output_path, size, output_name=None):
    """Write a PNG thumbnail of the EPUB cover into *output_path*.

    Args:
        epub: an open ``zipfile.ZipFile`` of the EPUB.
        output_path: existing directory that receives the thumbnail.
        size: maximum width and height of the thumbnail, in pixels.
        output_name: file name of the thumbnail. Defaults to the base name
            of the archive (``epub.filename``) with ``.png`` appended.

    Returns:
        True if a thumbnail was written, False if every strategy failed.

    Raises:
        ImportError: if Pillow is not installed.
    """
    if Image is None:
        raise ImportError("Pillow (PIL) is required to extract covers")
    if output_name is None:
        # Previously taken from a script-level global, which made this
        # function unusable when the module was imported.
        output_name = os.path.basename(epub.filename or "cover") + '.png'

    extraction_strategies = [get_cover_from_manifest, get_cover_by_guide, get_cover_by_filename]

    for strategy in extraction_strategies:
        # Deliberately broad: a failing strategy (malformed XML, missing
        # member, undecodable image, ...) must not stop the next one.
        try:
            cover_path = strategy(epub)
            if cover_path:
                with epub.open(cover_path) as cover:
                    im = Image.open(BytesIO(cover.read()))
                im.thumbnail((size, size), Image.LANCZOS)
                if im.mode == "CMYK":
                    # PNG cannot store CMYK pixel data.
                    im = im.convert("RGB")
                im.save(os.path.join(output_path, output_name), "PNG")
                return True
        except Exception as ex:
            print("Error getting cover using %s: " % strategy.__name__, ex)

    return False


def main(argv=None):
    """Command-line entry point; returns the process exit status."""
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        print("Usage: %s INPUT_EPUB SIZE" % os.path.basename(sys.argv[0]),
              file=sys.stderr)
        return 2

    input_file = args[0]
    try:
        size = int(args[1])
    except ValueError:
        print("SIZE must be an integer, got %r" % args[1], file=sys.stderr)
        return 2

    folder_path = DEFAULT_OUTPUT_ROOT
    os.makedirs(folder_path, exist_ok=True)
    # NOTE(review): when input_file is absolute, os.path.join discards
    # folder_path, so the thumbnail is written next to the EPUB itself.
    # Kept as is; confirm whether that is intended.
    output_path = os.path.join(folder_path, os.path.dirname(input_file))
    os.makedirs(output_path, exist_ok=True)

    with zipfile.ZipFile(input_file, "r") as epub:
        if extract_cover(epub, output_path, size):
            return 0

    print("Error extracting cover")
    return 1


if __name__ == '__main__':
    sys.exit(main())