#!/usr/bin/env python
"""
A simple script that takes one or more directory paths, each identifying a
root directory from which to do a recursive walk, checking each PDF file
(identified by extension of '.pdf') using Poppler to see if it appears to be
valid and printing the full path of any file that appears not to be valid to
stdout, one path per line. Messages about why a file was considered invalid
are written to stderr.

See the usage function for more info, or run 'python pdfchecker.py -h' to see
the usage information. In particular, unless you follow the steps described
there, you may see thousands of lines of error output from the Poppler
library, which we have no way in this script of controlling.

This script depends on the pypoppler library, which provides Python bindings
to the Poppler library, which you also must have installed. See
http://launchpad.net/poppler-python for getting and installing pypoppler.

If you use Gentoo Linux, you can use the ebuild that is available at
http://protempore.net/~calvins/code/#shell until an official ebuild is in
Portage.
"""

import os
import random
import sys
from os.path import join, abspath, isdir, isfile

# pypoppler (http://launchpad.net/poppler-python) is required to actually
# check files.  The import is guarded so that '-h' still works without it;
# both the main block and check_pdf_file_rand_pages refuse to run when it is
# missing, so a missing library is never mistaken for an invalid PDF.
try:
    from poppler import document_new_from_file
except ImportError:
    document_new_from_file = None

# 'basestring' only exists on Python 2; fall back to 'str' on Python 3 so the
# argument checks below work on both.
try:
    _string_types = basestring
except NameError:
    _string_types = str

# Upper bound on the number of pages sampled from each document.
_MAX_PAGES_TO_CHECK = 3

_URI_SCHEME = 'file://'


def do_print(obj, end='\n', out=sys.stdout, prefix='pdfchecker: '):
    """
    A simple print function used here in order to avoid print and be
    compatible with both pre- and post-python3 versions of python.

    This function is simpler than the python3 print function, only accepting
    a single object arg (and no sep arg).

    obj    -- the object to write; it is converted with str().
    end    -- text written after obj; nothing is written if it is empty.
    out    -- the writable file object to write to.
    prefix -- if non-empty, written before the main message (with no newline
              after it). It is used here to distinguish our error messages
              from Poppler messages (which can't be intercepted) in the
              stderr output, and defaults to 'pdfchecker: '.
    """
    if prefix:
        out.write(prefix)
    out.write(str(obj))
    if end:
        out.write(end)


def normalize_path_as_uri(base_dir, filename, validate=False):
    """
    Normalize two strings representing a directory (as either a relative or
    absolute path) and the name of a file in that directory, respectively, by
    converting them to a single absolute file:// URI. This is the format that
    Poppler requires filenames to be in.

    The third argument is a boolean value that determines whether to verify
    that the first two arguments represent an existing directory and an
    existing file in that directory (verify iff True). Both of these checks
    require an os call and a filesystem check. It is False by default,
    meaning that no such checks are performed; you'll still get an error
    message, but it won't be as friendly.

    Returns the URI as a string. Raises ValueError if validate is True and
    either check fails.

    NOTE(review): the path is not percent-encoded, so names containing
    characters such as '#' or '%' presumably do not form a URI that Poppler
    accepts -- confirm against the Poppler/GLib documentation before changing.
    """
    assert isinstance(base_dir, _string_types) and isinstance(filename, _string_types)
    path = join(abspath(base_dir), filename)
    if validate:
        if not isdir(base_dir):
            raise ValueError("base_dir arg '%s' to normalize_path_as_uri is not a directory." % base_dir)
        if not isfile(path):
            raise ValueError("filename arg '%s' to normalize_path_as_uri is not a file in directory '%s'."
                             % (filename, base_dir))
    return _URI_SCHEME + path


def print_bad_file_handler(filepath, exc, err_stream=sys.stderr, out_stream=sys.stdout):
    """
    A default function for handling a bad PDF file.

    If exc is given, a message made of the filepath, the text of the
    exception and the filepath again is written to err_stream (with the
    usual 'pdfchecker: ' prefix). The filepath alone is then always written
    to out_stream. Both streams must be file objects that can be written to.
    """
    if exc:
        msg = filepath + "\n" + str(exc) + "\n" + filepath
        do_print(msg, out=err_stream)
    # print filename to out_stream, making sure no prefix appears (since the
    # output needs to be usable by other scripts).
    do_print(filepath, out=out_stream, prefix='')


def check_pdf_file_rand_pages(file_uri, bad_file_handler=print_bad_file_handler,
                              open_document=None):
    """
    Check the file identified by the file URI by trying to open it, verifying
    that it has at least 1 page, and try to retrieve up to 3 distinct random
    pages from the document, failing if any of these operations do not
    succeed and calling bad_file_handler with the file path and an exception
    as args.

    The bad_file_handler arg must be a callable that accepts 2 arguments, the
    first being a string representing the path of the file (the URI with the
    leading 'file://' removed), and the second being an exception that may
    have been thrown by this script (if the PDF file has no pages or a page
    could not be retrieved) or by Poppler (for any other failure).

    The open_document arg, if given, must be a callable taking the URI and a
    password (always None here) and returning an object with get_n_pages()
    and get_page(index) methods. It defaults to Poppler's
    document_new_from_file.

    Files whose name does not end in '.pdf' are skipped with a message on
    stderr. Raises ValueError if file_uri is not a 'file://' URI and
    ImportError if no open_document is given and pypoppler is not installed.

    Note: this function does not handle PDF files that are password-protected.
    I'm not sure how Poppler treats them when no password is given, since I
    couldn't find a password-protected file to test on and don't have a means
    of creating such a file. Password-protected files may be treated as
    errors, so if you have some files that are password-protected, you should
    watch out for them in the output as invalid files and manually verify for
    yourself if they are valid or not.
    """
    assert isinstance(file_uri, _string_types) and callable(bad_file_handler)
    if not file_uri.lower().endswith('.pdf'):
        do_print("Skipping non-PDF file: %s" % file_uri, out=sys.stderr)
        return
    if not file_uri.startswith(_URI_SCHEME):
        raise ValueError("file_uri arg '%s' to check_pdf_file_rand_pages must be a file:// URI." % file_uri)
    if open_document is None:
        # Must be checked outside the try block below: otherwise the missing
        # library would make every single file look like an invalid PDF.
        if document_new_from_file is None:
            raise ImportError("pypoppler is not installed; see http://launchpad.net/poppler-python")
        open_document = document_new_from_file
    try:
        doc = open_document(file_uri, None)  # 2nd arg is password.
        n = doc.get_n_pages()
        # A valid PDF should have at least 1 page.
        if n < 1:
            raise Exception("Invalid PDF file '%s': file has %d pages." % (file_uri, n))
        # Check up to 3 distinct random pages; if no problems, consider it
        # valid. Page indices are zero-based, so valid ones are 0 .. n-1.
        for index in random.sample(range(n), min(n, _MAX_PAGES_TO_CHECK)):
            if doc.get_page(index) is None:
                raise Exception("Invalid PDF file '%s': could not retrieve page %d." % (file_uri, index))
    except Exception as e:
        # If an exception was thrown, we consider it an invalid document,
        # and delegate handling the failure with the bad_file_handler.
        # We strip off the 'file://' beginning of the URI so that the user
        # doesn't have to do that before using the output with standard
        # Unix utilities that don't understand the 'file://' scheme.
        bad_file_handler(file_uri[len(_URI_SCHEME):], e)


def check_files(root_dir, check_file=check_pdf_file_rand_pages,
                bad_file_handler=print_bad_file_handler,
                normalize_path=normalize_path_as_uri,
                err_stream=sys.stderr, verify_paths=False):
    """
    Check all files in a directory, recursively, using the various supplied
    helper functions and helper objects, which are described below.

    The check_file arg must be callable and must accept two arguments. The
    first argument is a string representing an absolute path in the
    appropriate format (e.g., a 'file://' URI for Poppler). This function
    creates that string using the normalize_path callable arg, which is
    described below. The second arg to check_file is the bad_file_handler that
    is passed into this function. check_file is invoked exactly once per
    file, for every file that is in root_dir or a sub-directory (recursively)
    of root_dir.

    The bad_file_handler arg must be callable and must accept at least two
    args. It is not called here; it is only handed to check_file, which is
    expected to call it for every file that fails to validate, with a string
    representing the path or URI of the file that failed as the first arg,
    and an Exception (possibly None) as the second.

    The normalize_path arg must be callable and must accept two string
    arguments representing a directory (absolute or relative) and a filename
    in that directory, respectively, plus the verify_paths flag. It is
    responsible for creating a file path or URI in whatever format is
    required by the check_file function. For example, the
    check_pdf_file_rand_pages function defined in this module requires the
    file to be given as a 'file://'-type URI, and the default value of
    normalize_path is normalize_path_as_uri, which creates such a URI.

    The err_stream must be a writable file object. Before checking each file,
    this function prints the file path to this stream; this is in order to be
    able to associate Poppler's error messages with the appropriate file.

    The verify_paths argument is a boolean value indicating whether to
    perform potentially slow verification that various paths represent real
    files and directories on the filesystem (this is just to fail sooner and
    with a nicer error message). It is passed through to normalize_path.
    """
    # os.walk already descends into every sub-directory, so no explicit
    # recursion is needed (it would check nested files more than once).
    for current_dir, _subdirs, filenames in os.walk(root_dir):
        for filename in filenames:
            norm_path = normalize_path(current_dir, filename, verify_paths)
            do_print(norm_path, out=err_stream)
            check_file(norm_path, bad_file_handler)


def usage(prog_name, out=sys.stdout):
    """
    Write the help text for this script to out, using prog_name as the name
    of the program in the synopsis line.
    """
    do_print("""Usage: %s [-h|--help] BASE_DIR [BASE_DIR]*

ARGUMENTS

  BASE_DIR       a directory in which to recursively check all PDF files.
  -h | --help    Display this help, then exit

DESCRIPTION

This script validates all PDF files recursively in one or more BASE_DIR
directories, writing the path of each invalid file to stdout and messages
about why it was considered invalid to stderr.

The underlying library that this script uses (Python bindings to Poppler)
produces extremely voluminous error messages (hundreds of lines per file in
many cases) directly to stderr, with no way of turning off this behavior, so
you will almost certainly want to divert stderr output to /dev/null or to some
other file after verifying that the script is running successfully to
completion.

You should save the output to a file before doing anything with it and
manually santity-check the list of 'invalid' files, since I don't have the
utmost faith in Poppler. Also, password-protected files may appear in the
output as invalid files, since we try to open all files without a password.
If you have some password-protected files and you see them in the output,
make sure to manually check them.

EXAMPLES

Process all PDF files in a single directory, $HOME/papers, without diverting
error messages. You may see hundreds of lines of error messages from Poppler,
which we cannot intercept. At this point, we just want to make sure that the
script isn't complaining about the arguments we've submitted and isn't failing
unexpectedly.

  $ python pdfchecker.py $HOME/papers

If the last line of output is something like 'pdfchecker: Completed
successfully', then the script finished successfully, and we can safely run it
again and divert the error messages, as with next command. Any message written
to stderr by this script starts with 'pdfchecker: ', so you can distinguish
between error output from this script versus error output from Poppler (which
we can't intercept and which has no prefix (but generally begins with
'Error: ')). Most of the unwanted lines that Poppler outputs contain the
string "Page transition object", so you can easily see the more important
error messages in the err.out file using the following, which shows only lines
that don't contain that string:

  $ grep -v "Page transition object" err.out

Any error output there is associated with the last file printed before the
error message beings.

The following command process all PDF files in 2 directories, and error output
is diverted to an err.out file in the current directory. The path of each file
that was considered invalid is written to stdout, one file path per line.
Since we are sending error output to err.out, you will just see a list of
files that are considered invalid.

  $ python pdfchecker.py $HOME/papers $HOME/manuals 2>> err.out

Next, we will run the same command, but send the list of invalid files to a
file named invalid.out. The following should be all on one line, and you won't
see any output, since errors went to err.out, and invalid file paths went to
invalid.out.

  $ python pdfchecker.py $HOME/papers $HOME/manuals 1>> invalid.out 2>> err.out

You can do a random check of a few paths in invalid.out to verify that the
file is actually invalid. If you have any passowrd-protected files, they may
be listed as invalid even if they are valid. If so, remove those lines from
the file before doing anything else.

After verifying that there are no password-protected files in the invalid.out
file and checking a few files to verify that they're broken, you might want to
delete all of those invalid files. You can use a command like the following
(for BASH) to delete the files listed in invalid.out:

  $ IFS=$'\\n'; for line in $(cat invalid.out); do rm "$line"; done

The IFS is required only if some of your filenames contain whitespace. It
forces the for loop to use only a newline as the separator.

Or if you want to move them all somewhere else for dealing with later, you
might do:

  $ mkdir invalid_pdf_files
  $ export IFS=$'\\n'
  $ for line in $(cat invalid.out); do mv "$line" invalid_pdf_files; done
""" % prog_name, out=out)


def _in_ipython():
    """Return True if the (old-style) iPython marker name __IP is defined."""
    try:
        __IP
    except NameError:
        return False
    return True


def _main(argv):
    """
    Run the script with the given argument list (argv[0] is the program
    name) and return the process exit status: 0 on success, on '-h' and when
    no arguments are given, 1 if an argument is not a directory or pypoppler
    is not installed.
    """
    prog_name = argv[0]
    if len(argv) < 2:
        # do nothing if no user-args given
        return 0
    if argv[1] in ('-h', '--help'):
        usage(prog_name)
        return 0

    def report_to_both(message):
        # Print error to both stderr and stdout, because err might be
        # diverted, and we're not using stdout for useful output yet.
        for out in (sys.stdout, sys.stderr):
            do_print(message, out=out)

    if document_new_from_file is None:
        report_to_both("The pypoppler library is not installed; see http://launchpad.net/poppler-python")
        return 1

    # interpret all args as dirs to be processed
    dirs = argv[1:]
    # Verify that they're actually directories before starting
    for root_dir in dirs:
        if not os.path.isdir(root_dir):
            report_to_both("'%s' is not a directory. Try -h for usage for info." % root_dir)
            return 1

    # Process each directory in the order they were given
    for root_dir in dirs:
        check_files(root_dir)

    # Print success message to stderr on completion, in order to signal that
    # it completed successfully (which is otherwise not at all obvious given
    # all the Poppler garbage output).
    do_print("Completed successfully", out=sys.stderr)
    return 0


if __name__ == '__main__':
    # If we are in iPython, do nothing.
    if not _in_ipython():
        sys.exit(_main(sys.argv))