diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..24f9f8d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+.env
+data/
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..f45ee0b
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,88 @@
+version: "3.8"
+services:
+  # ArchiveBox server, listening on port 8000 inside the container.
+  archivebox:
+    image: archivebox/archivebox:master
+    command: server --quick-init 0.0.0.0:8000
+    ports:
+      # Published on the host's loopback interface only (port 90); quoted so
+      # the colon-separated mapping is always parsed as a string.
+      - "127.0.0.1:90:8000"
+    environment:
+      - ALLOWED_HOSTS=*
+      - MEDIA_MAX_SIZE=750m
+      # The SEARCH_BACKEND_* settings point ArchiveBox at the sonic service
+      # defined below; remove them if you remove that service.
+      - SEARCH_BACKEND_ENGINE=sonic
+      - SEARCH_BACKEND_HOST_NAME=sonic
+      # No value given: Compose passes the variable through from the
+      # environment it is run in.
+      - SEARCH_BACKEND_PASSWORD
+    volumes:
+      - ./data:/data
+      # - ./archivebox:/app/archivebox  # for developers working on archivebox
+
+  # To run the Sonic full-text search backend, first download the config file to sonic.cfg:
+  #   curl -O https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/etc/sonic.cfg
+  # After starting, backfill any existing Snapshots into the index:
+  #   docker-compose run archivebox update --index-only
+  sonic:
+    image: valeriansaliou/sonic:v1.3.0
+    # expose:
+    #   - 1491
+    environment:
+      # Substituted into sonic.cfg as ${env.SEARCH_BACKEND_PASSWORD}.
+      - SEARCH_BACKEND_PASSWORD
+    volumes:
+      - ./sonic.cfg:/etc/sonic.cfg:ro
+      - ./data/sonic:/var/lib/sonic/store
+
+
+  ### Optional Addons: tweak these examples as needed for your specific use case
+
+  # Example: Run scheduled imports in a docker instead of using cron on the
+  # host machine, add tasks and see more info with archivebox schedule --help
+  # scheduler:
+  #   image: archivebox/archivebox:master
+  #   command: schedule --foreground --every=day --depth=1 'https://getpocket.com/users/USERNAME/feed/all'
+  #   environment:
+  #     - USE_COLOR=True
+  #     - SHOW_PROGRESS=False
+  #   volumes:
+  #     - ./data:/data
+
+  # Example: Put Nginx in front of the ArchiveBox server for SSL termination
+  # nginx:
+  #   image: nginx:alpine
+  #   ports:
+  #     - "443:443"
+  #     - "80:80"
+  #   volumes:
+  #     - ./etc/nginx/nginx.conf:/etc/nginx/nginx.conf
+  #     - ./data:/var/www
+
+  # Example: run all your ArchiveBox traffic through a WireGuard VPN tunnel
+  # wireguard:
+  #   image: linuxserver/wireguard
+  #   network_mode: 'service:archivebox'
+  #   cap_add:
+  #     - NET_ADMIN
+  #     - SYS_MODULE
+  #   sysctls:
+  #     - net.ipv4.conf.all.rp_filter=2
+  #     - net.ipv4.conf.all.src_valid_mark=1
+  #   volumes:
+  #     - /lib/modules:/lib/modules
+  #     - ./wireguard.conf:/config/wg0.conf:ro
+
+  # Example: Run PYWB in parallel and auto-import WARCs from ArchiveBox
+  # pywb:
+  #   image: webrecorder/pywb:latest
+  #   entrypoint: /bin/sh -c 'wb-manager add default /archivebox/archive/*/warc/*.warc.gz; wayback --proxy;'
+  #   environment:
+  #     - INIT_COLLECTION=archivebox
+  #   ports:
+  #     - "8080:8080"
+  #   volumes:
+  #     - ./data:/archivebox
+  #     - ./data/wayback:/webarchive
diff --git a/sonic.cfg b/sonic.cfg
new file mode 100644
index 0000000..10d94ea
--- /dev/null
+++ b/sonic.cfg
@@ -0,0 +1,67 @@
+# Sonic
+# Fast, lightweight and schema-less search backend
+# Configuration file
+# Example: https://github.com/valeriansaliou/sonic/blob/master/config.cfg
+
+
+[server]
+
+log_level = "warn"
+
+
+[channel]
+
+inet = "0.0.0.0:1491"
+tcp_timeout = 300
+
+# Taken from the SEARCH_BACKEND_PASSWORD environment variable of the sonic process.
+auth_password = "${env.SEARCH_BACKEND_PASSWORD}"
+
+[channel.search]
+
+query_limit_default = 65535
+query_limit_maximum = 65535
+query_alternates_try = 10
+
+suggest_limit_default = 5
+suggest_limit_maximum = 20
+
+
+[store]
+
+[store.kv]
+
+path = "/var/lib/sonic/store/kv/"
+
+retain_word_objects = 100000
+
+[store.kv.pool]
+
+inactive_after = 1800
+
+[store.kv.database]
+
+flush_after = 900
+
+compress = true
+parallelism = 2
+max_files = 100
+max_compactions = 1
+max_flushes = 1
+write_buffer = 16384
+write_ahead_log = true
+
+[store.fst]
+
+path = "/var/lib/sonic/store/fst/"
+
+[store.fst.pool]
+
+inactive_after = 300
+
+[store.fst.graph]
+
+consolidate_after = 180
+
+max_size = 2048
+max_words = 250000