blob: 23241258c54325f8dc67d7e76fbabcb39e21922f [file] [log] [blame]
#!/usr/bin/env python3
# Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file
# for details. All rights reserved. Use of this source code is governed by a
# BSD-style license that can be found in the LICENSE file.
'''
This script finds all HTML pages in a folder and downloads all images, replacing
the urls with local ones.
'''
import os, sys, optparse, subprocess, multiprocessing
from os.path import abspath, basename, dirname, join
# Directory that contains this script.
SWARM_PATH = dirname(abspath(__file__))
# Two directory levels above SWARM_PATH.
CLIENT_PATH = dirname(dirname(SWARM_PATH))
# The 'tools' directory directly under CLIENT_PATH.
CLIENT_TOOLS_PATH = join(CLIENT_PATH, 'tools')
# Add the client tools directory so we can find htmlconverter.py.
# Keep this ahead of the import below: the append is what makes the module
# findable.
sys.path.append(CLIENT_TOOLS_PATH)
import htmlconverter
# Path of the htmlconverter.py script inside the tools directory.
# NOTE(review): not referenced anywhere in the visible part of this file --
# confirm whether it is still needed.
converter = CLIENT_TOOLS_PATH + '/htmlconverter.py'
# This has to be a top level function to use with multiprocessing
def convertImgs(infile):
    """Convert a single HTML file in place for offline use.

    Calls htmlconverter.convertForOffline with `infile` as both the input
    and the output path, passing the `verbose` and `inline_images` settings
    read from the module-level `options` global (assigned in main()).

    This is kept as a top-level function so it can be used with
    multiprocessing.

    Any error is reported on stdout and swallowed, so one bad file does not
    abort the rest of the batch.

    Args:
        infile: Path of the HTML file to rewrite.
    """
    global options
    try:
        htmlconverter.convertForOffline(
            infile,
            infile,
            verbose=options.verbose,
            encode_images=options.inline_images)
        print('Converted ' + infile)
    # NOTE(review): BaseException also swallows KeyboardInterrupt and
    # SystemExit raised while converting. Presumably this keeps a pool
    # worker alive no matter what the converter does -- confirm before
    # narrowing it.
    except BaseException as e:
        # Name the file: many workers print concurrently, so an error
        # message without the path cannot be traced back to its input.
        print('Caught error converting %s: %s' % (infile, e))
def Flags():
    """Build the optparse parser that understands this script's switches.

    Both switches are booleans that default to False and become True when
    given on the command line:

      --inline_images  embed image payloads as data URLs instead of files
      --verbose        print verbose output

    Returns:
        The configured optparse.OptionParser.
    """
    # (option string, help text) for every store_true switch we accept.
    switches = (
        ("--inline_images",
         "Encode img payloads as data:// URLs rather than local files."),
        ("--verbose", "Print verbose output"),
    )
    flag_parser = optparse.OptionParser()
    for switch, description in switches:
        flag_parser.add_option(
            switch, action="store_true", default=False, help=description)
    return flag_parser
def _initWorker(parsed_options):
    """Publish the parsed flags as the `options` global in a pool worker.

    Worker processes started with the 'spawn' method re-import this module
    and therefore never see the `options` value that main() assigned in the
    parent process; the pool runs this initializer once per worker to set it.
    """
    global options
    options = parsed_options


def main():
    """Convert every .html file under the directory named on the command line.

    Parses the flags, walks the directory tree collecting files whose names
    end in '.html', and runs convertImgs over them in a process pool.

    Returns:
        1 if no directory was given, the argument is the word 'help', or the
        argument is not an existing directory; 0 otherwise.
    """
    global options
    parser = Flags()
    options, args = parser.parse_args()
    print("args: %s" % args)
    # Only the literal word 'help' asks for usage; a substring test would
    # also reject real directories such as 'helpers/'.
    if len(args) < 1 or args[0] == 'help':
        print('Usage: %s DIRECTORY' % basename(sys.argv[0]))
        return 1

    # Named search_dir so the os.path.dirname import is not shadowed.
    search_dir = args[0]
    if not os.path.isdir(search_dir):
        # os.walk silently yields nothing for a missing path, which would
        # otherwise look like a successful run.
        print('Not a directory: ' + search_dir)
        return 1

    print('Searching directory ' + search_dir)
    files = [
        join(root, fname)
        for root, _dirs, fnames in os.walk(search_dir)
        for fname in fnames
        if fname.endswith('.html')
    ]
    if not files:
        # Nothing to convert: do not spin up a pool for no work.
        return 0

    # Oversubscribe the CPUs as before, but never start more workers than
    # there are files.
    count = min(4 * multiprocessing.cpu_count(), len(files))
    # The with-block terminates the pool on every exit path (success,
    # timeout or Ctrl-C) so no worker processes are left behind.
    with multiprocessing.Pool(
            processes=count,
            initializer=_initWorker,
            initargs=(options,)) as pool:
        # Note: need a timeout to get keyboard interrupt due to a Python bug
        pool.map_async(convertImgs, files).get(3600)  # one hour
    return 0
if __name__ == '__main__':
    # Hand main()'s return value to the shell so a usage error (1) yields a
    # non-zero exit status; a None return still exits with status 0.
    sys.exit(main())