How to use CutyCapt inside ArchiveBox

Use CutyCapt inside ArchiveBox to capture a screenshot of the complete web page.

Install the CutyCapt utility and the X virtual framebuffer (Xvfb).

$ sudo apt install cutycapt xvfb

Define the configuration variables [archivebox/config.py file].

diff --git a/archivebox/config.py b/archivebox/config.py
index 47f1776..3c41340 100644
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -49,6 +49,13 @@ CHROME_BINARY =          os.getenv('CHROME_BINARY',          None)
 
 URL_BLACKLIST =          os.getenv('URL_BLACKLIST',          None)
 
+
+CUTYCAPT_BINARY =        os.getenv('CUTYCAPT_BINARY',        'cutycapt')
+CUTYCAPT_USER_AGENT =    os.getenv('CUTYCAPT_USER_AGENT',    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36')
+CUTYCAPT_MIN_WIDTH =     os.getenv('CUTYCAPT_MIN_WIDTH',     '1280')
+CUTYCAPT_SMOOTH =        os.getenv('CUTYCAPT_SMOOTH',        'True').lower() == 'true'
+CUTYCAPT_DELAY =         os.getenv('CUTYCAPT_DELAY',         '2000')
+
 try:
     OUTPUT_DIR = os.path.abspath(os.getenv('OUTPUT_DIR'))
 except Exception:
@@ -159,6 +166,15 @@ try:
         'CHROME_USER_DATA_DIR': CHROME_USER_DATA_DIR,
     }
 
+    CUTYCAPT_OPTIONS = {
+        'TIMEOUT': TIMEOUT,
+        'CHECK_SSL_VALIDITY': CHECK_SSL_VALIDITY,
+        'CUTYCAPT_BINARY': CUTYCAPT_BINARY,
+        'CUTYCAPT_USER_AGENT': CUTYCAPT_USER_AGENT,
+        'CUTYCAPT_MIN_WIDTH': CUTYCAPT_MIN_WIDTH,
+        'CUTYCAPT_SMOOTH': CUTYCAPT_SMOOTH,
+        'CUTYCAPT_DELAY': CUTYCAPT_DELAY,
+    }
 
     ### Check Python environment
     python_vers = float('{}.{}'.format(sys.version_info.major, sys.version_info.minor))

Define a helper function to build the cutycapt shell command [archivebox/util.py file].

diff --git a/archivebox/util.py b/archivebox/util.py
index cec2303..763365c 100644
--- a/archivebox/util.py
+++ b/archivebox/util.py
@@ -30,6 +30,7 @@ from config import (
     CHECK_SSL_VALIDITY,
     WGET_USER_AGENT,
     CHROME_OPTIONS,
+    CUTYCAPT_OPTIONS,
 )
 from logs import pretty_path
 
@@ -568,3 +569,36 @@ def chrome_args(**options):
         cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR']))
     
     return cmd_args
+
+def cutycapt_args(**options):
+    """Build the xvfb-run/cutycapt command line as a list of arguments.
+
+    Keyword options override the defaults in CUTYCAPT_OPTIONS. URL and OUT
+    are optional: when absent or empty, the matching flag is left out.
+    TIMEOUT is multiplied by 1000 to form the --max-wait value.
+    """
+    options = {**CUTYCAPT_OPTIONS, **options}
+
+    cmd_args = ['xvfb-run', options['CUTYCAPT_BINARY']]
+
+    if not options['CHECK_SSL_VALIDITY']:
+        cmd_args.append('--insecure')
+    if options['CUTYCAPT_USER_AGENT']:
+        cmd_args.append('--user-agent={}'.format(options['CUTYCAPT_USER_AGENT']))
+    if options['CUTYCAPT_MIN_WIDTH']:
+        cmd_args.append('--min-width={}'.format(options['CUTYCAPT_MIN_WIDTH']))
+    if options['TIMEOUT']:
+        cmd_args.append('--max-wait={}'.format(options['TIMEOUT'] * 1000))
+    if options['CUTYCAPT_SMOOTH']:
+        cmd_args.append('--smooth')
+    if options['CUTYCAPT_DELAY']:
+        cmd_args.append('--delay={}'.format(options['CUTYCAPT_DELAY']))
+
+    # URL and OUT are not part of CUTYCAPT_OPTIONS, so look them up with
+    # .get() to avoid a KeyError when the caller does not pass them.
+    if options.get('URL'):
+        cmd_args.append('--url={}'.format(options['URL']))
+    if options.get('OUT'):
+        cmd_args.append('--out={}'.format(options['OUT']))
+
+    return cmd_args

Modify the fetch_screenshot function to use cutycapt [archivebox/archive_methods.py file].

diff --git a/archivebox/archive_methods.py b/archivebox/archive_methods.py
index b2f04f3..299b9e6 100644
--- a/archivebox/archive_methods.py
+++ b/archivebox/archive_methods.py
@@ -32,9 +32,13 @@ from config import (
     GIT_SHA,
     WGET_USER_AGENT,
     CHECK_SSL_VALIDITY,
     COOKIES_FILE,
-    WGET_AUTO_COMPRESSION
+    WGET_AUTO_COMPRESSION,
+    CUTYCAPT_USER_AGENT,
+    CUTYCAPT_MIN_WIDTH,
+    CUTYCAPT_SMOOTH,
+    CUTYCAPT_DELAY
 )
 from util import (
     domain,
     extension,
@@ -45,8 +49,9 @@ from util import (
     TimedProgress,
     chmod_file,
     wget_output_path,
     chrome_args,
+    cutycapt_args,
     check_link_structure,
     run, PIPE, DEVNULL
 )
 from logs import (
@@ -338,13 +343,9 @@ def should_fetch_screenshot(link_dir, link):
 def fetch_screenshot(link_dir, link, timeout=TIMEOUT):
     """take screenshot of site using chrome --headless"""
 
     output = 'screenshot.png'
-    cmd = [
-        *chrome_args(TIMEOUT=timeout),
-        '--screenshot',
-        link['url'],
-    ]
+    cmd = cutycapt_args(TIMEOUT=50000,URL=link['url'],OUT=output)
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix='      ')
     try:
         result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)

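This will generate the following command. Note that cutycapt_args multiplies TIMEOUT by 1000, so the TIMEOUT=50000 passed above becomes --max-wait=50000000; pass the value in seconds (for example TIMEOUT=timeout) if a shorter wait is intended.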

xvfb-run cutycapt "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36" --min-width=1280 --max-wait=50000000 --smooth --delay=2000 --url=https://digiday.com/media/wtf-link-rot/ --out=screenshot.png

I love this!