#!/bin/sh

# Abort as soon as any command exits with a non-zero status.
set -e

# All three positional arguments are required. The command dispatch in this
# script only implements "download" (anything else is reported as an invalid
# command), so the usage text advertises just that. Usage goes to stderr
# because it is a diagnostic, not program output.
if [ "$#" -lt 3 ]; then
    echo "Usage: ./libpostal_data download [base|parser|language_classifier|all] data_dir" >&2
    exit 1
fi

# Positional arguments: the action to perform, which data set it applies to,
# and the directory that holds the libpostal data.
COMMAND="$1"
FILE="$2"
LIBPOSTAL_DATA_DIR="$3"

# Multipart downloads request an asset in byte ranges of 64 MiB each.
MB=$((1024 * 1024))
CHUNK_SIZE=$((64 * MB))

# When equal to "senzing", the override section further down is applied.
DATAMODEL=""

# Where release assets are fetched from.
LIBPOSTAL_REPO_NAME="openvenues/libpostal"
LIBPOSTAL_BASE_URL="https://github.com/${LIBPOSTAL_REPO_NAME}/releases/download"

# Archive names of the three downloadable data sets.
LIBPOSTAL_DATA_FILE="libpostal_data.tar.gz"
LIBPOSTAL_PARSER_FILE="parser.tar.gz"
LIBPOSTAL_LANG_CLASS_FILE="language_classifier.tar.gz"

# Release tag each archive is downloaded from.
LIBPOSTAL_DATA_FILE_LATEST_VERSION="v1.0.0"
LIBPOSTAL_PARSER_MODEL_LATEST_VERSION="v1.0.0"
LIBPOSTAL_LANG_CLASS_MODEL_LATEST_VERSION="v1.0.0"

# Number of CHUNK_SIZE ranges per archive. These are hard-coded (not ideal):
# there appears to be no way to query the size of a release asset without
# going through the GitHub API.
LIBPOSTAL_DATA_FILE_CHUNKS=1
LIBPOSTAL_PARSER_MODEL_CHUNKS=12
LIBPOSTAL_LANG_CLASS_MODEL_CHUNKS=1

# Layout version of the data directory itself.
LIBPOSTAL_DATA_DIR_VERSION_STRING="v1"

# Alternative settings for the "senzing" data model: every archive is a
# single chunk and assets come from an S3 bucket instead of GitHub releases.
case "$DATAMODEL" in
    senzing)
        LIBPOSTAL_BASE_URL="https://public-read-libpostal-data.s3.amazonaws.com"

        LIBPOSTAL_DATA_FILE_LATEST_VERSION="v1.0.0"
        LIBPOSTAL_PARSER_MODEL_LATEST_VERSION="v1.0.0"
        LIBPOSTAL_LANG_CLASS_MODEL_LATEST_VERSION="v1.0.0"

        LIBPOSTAL_DATA_FILE_CHUNKS=1
        LIBPOSTAL_PARSER_MODEL_CHUNKS=1
        LIBPOSTAL_LANG_CLASS_MODEL_CHUNKS=1

        LIBPOSTAL_DATA_DIR_VERSION_STRING="v1"
        ;;
esac

# Marker file holding the layout version of the data directory, and the
# variable that receives its contents once the download command reads it.
LIBPOSTAL_DATA_VERSION_FILE=$LIBPOSTAL_DATA_DIR/data_version
LIBPOSTAL_DATA_DIR_VERSION=

# Quoted (and preceded by --) so that a directory name containing whitespace,
# glob characters or a leading dash is created as exactly one path.
mkdir -p -- "$LIBPOSTAL_DATA_DIR"

# Per-archive files recording which release is currently installed.
LIBPOSTAL_DATA_FILE_VERSION_PATH=$LIBPOSTAL_DATA_DIR/base_data_file_version
LIBPOSTAL_PARSER_MODEL_VERSION_PATH=$LIBPOSTAL_DATA_DIR/parser_model_file_version
LIBPOSTAL_LANG_CLASS_MODEL_VERSION_PATH=$LIBPOSTAL_DATA_DIR/language_classifier_model_file_version

# Sub-directories of the data directory owned by each archive; they are
# removed before the corresponding archive is unpacked again.
BASIC_MODULE_DIRS="address_expansions numex transliteration"
PARSER_MODULE_DIR=address_parser
LANGUAGE_CLASSIFIER_MODULE_DIR=language_classifier

# Maximum number of chunk downloads run in parallel by xargs.
NUM_WORKERS=12

# SIGINT handler: terminate any background jobs of this shell, then exit.
#
# Exits with status 130 (128 + SIGINT) so that an interrupted run is always
# reported as a failure, rather than with whatever status the last command
# happened to leave behind.
kill_background_processes() {
    pids=$(jobs -p)
    if [ -n "$pids" ]; then
        # Unquoted on purpose: `jobs -p` prints one PID per line and each one
        # must become a separate argument. Failures are ignored because a job
        # may already have exited by the time it is signalled.
        kill $pids 2>/dev/null || true
    fi
    exit 130
}

trap kill_background_processes INT

# Command template that xargs runs through `sh -c` once per chunk.
# Positional parameters seen by the template:
#   $1 part number, $2 first byte of the range, $3 last byte of the range,
#   $4 source URL, $5 path of the part file to write.
# The single quotes keep the $N references literal here so the child shell
# expands them. Inside the template the URL and the output path are
# double-quoted so values containing whitespace or glob characters are passed
# to curl as single arguments.
PART_MSG='echo "Downloading part $1: filename=$5, offset=$2, max=$3"'
PART_CURL='curl -L "$4" --silent -H"Range:bytes=$2-$3" --retry 3 --retry-delay 2 -o "$5"'
DOWNLOAD_PART="$PART_MSG;$PART_CURL"


# Download one file as several HTTP range requests run in parallel, then
# concatenate the parts, in order, into the destination file.
#
# Arguments:
#   $1 - URL to download
#   $2 - destination path; part N is written to "<path>.N" and removed
#        again after it has been appended to the destination
#   $3 - number of CHUNK_SIZE-byte ranges to request
# Globals read: CHUNK_SIZE, NUM_WORKERS, DOWNLOAD_PART
#
# The variables below are plain (global) shell variables, as in the rest of
# this script. All path expansions are quoted so that a destination containing
# whitespace or glob characters is handled as a single path.
download_release_multipart() {
    url=$1
    filename=$2
    num_chunks=$3

    echo "Downloading multipart: $url, num_chunks=$num_chunks"

    # Emit one NUL-separated 5-tuple (part, first byte, last byte, url, part
    # file) per chunk; xargs hands each tuple to DOWNLOAD_PART as $1..$5.
    offset=0
    i=0
    while [ "$i" -lt "$num_chunks" ]; do
        i=$((i+1))
        part_filename="$filename.$i"
        max=$((offset+CHUNK_SIZE-1))
        printf "%s\0%s\0%s\0%s\0%s\0" "$i" "$offset" "$max" "$url" "$part_filename"
        offset=$((offset+CHUNK_SIZE))
    done | xargs -0 -n 5 -P "$NUM_WORKERS" sh -c "$DOWNLOAD_PART" --

    # Start from an empty destination file, then append the parts in order.
    : > "$filename"

    i=0
    while [ "$i" -lt "$num_chunks" ]; do
        i=$((i+1))
        part_filename="$filename.$i"
        cat -- "$part_filename" >> "$filename"
        rm -- "$part_filename"
    done

}


# Download and unpack one release archive unless the recorded version already
# matches the requested one.
#
# Arguments:
#   $1 - file that records the currently installed version of this archive
#   $2 - version (release tag) to install
#   $3 - data directory the archive is downloaded to and unpacked in
#   $4 - number of chunks; values above 1 select the multipart download
#   $5 - archive file name
#   $6 - human-readable name used in progress messages
#   $7... - sub-directories of the data directory to delete before unpacking
# Globals read: LIBPOSTAL_BASE_URL
# Returns: non-zero if the single-file download or the extraction fails; in
#   that case the version file is left untouched.
download_release() {
    version_file_path=$1
    version=$2
    data_dir=$3
    num_chunks=$4
    filename=$5
    name=$6
    shift 6
    # Remaining arguments joined by spaces; split again in the loop below.
    subdirs="$*"

    local_path=$data_dir/$filename

    url=$LIBPOSTAL_BASE_URL/$version/$filename

    if [ -e "$version_file_path" ]; then
        current_version=$(cat "$version_file_path")
    else
        current_version=""
    fi

    echo "Checking for new libpostal $name..."

    if [ "$current_version" = "$version" ]; then
        echo "libpostal $name up to date"
        return 0
    fi

    echo "New libpostal $name available"

    if [ "$num_chunks" -gt 1 ]; then
        # NOTE: download_release_multipart assigns the global variables url,
        # filename and num_chunks, so only local_path is relied on afterwards.
        # Called as a plain command so that `set -e` stays active inside it.
        download_release_multipart "$url" "$local_path" "$num_chunks"
    else
        # --fail makes curl exit non-zero on an HTTP error instead of saving
        # the server's error page as the archive, so existing data is not
        # deleted below for a download that never succeeded.
        if ! curl -L --fail "$url" --retry 3 --retry-delay 2 -o "$local_path"; then
            echo "Failed to download $url" >&2
            rm -f -- "$local_path"
            return 1
        fi
    fi

    # Intentionally unquoted: $subdirs is a space-separated list of names.
    for subdir in $subdirs; do
        # :? aborts instead of running `rm -rf /<subdir>` on an empty data_dir.
        rm -rf -- "${data_dir:?}/$subdir"
    done

    if ! tar -xvzf "$local_path" --no-same-owner -C "$data_dir"; then
        echo "Failed to extract $local_path" >&2
        return 1
    fi
    rm -- "$local_path"
    # Record the installed version only after a successful extraction.
    echo "$version" > "$version_file_path"
}

# Command dispatch. Only "download" is implemented; any other command is
# rejected. All expansions are quoted so that empty values or values with
# whitespace cannot break the tests or split paths.
if [ "$COMMAND" = "download" ]; then
    # Reject an unknown data set up front instead of silently downloading
    # nothing and still stamping the data directory as current.
    case "$FILE" in
        base|parser|language_classifier|all)
            ;;
        *)
            echo "Invalid file: $FILE" >&2
            exit 1
            ;;
    esac

    if [ -e "$LIBPOSTAL_DATA_VERSION_FILE" ]; then
        LIBPOSTAL_DATA_DIR_VERSION=$(cat "$LIBPOSTAL_DATA_VERSION_FILE")

        # A data directory written with a different layout version is wiped
        # so that everything is downloaded again below.
        if [ "$LIBPOSTAL_DATA_DIR_VERSION" != "$LIBPOSTAL_DATA_DIR_VERSION_STRING" ]; then
            echo "Old version of datadir detected, removing..."
            # BASIC_MODULE_DIRS is a space-separated list: leave it unquoted.
            for subdir in $BASIC_MODULE_DIRS $PARSER_MODULE_DIR $LANGUAGE_CLASSIFIER_MODULE_DIR; do
                rm -rf -- "${LIBPOSTAL_DATA_DIR:?}/$subdir"
            done

            # Legacy, blow it away too to be nice
            if [ -e "$LIBPOSTAL_DATA_DIR/geodb" ]; then
                rm -rf -- "${LIBPOSTAL_DATA_DIR:?}/geodb"
            fi

            # Only the directory part is quoted so the trailing/leading globs
            # still expand.
            rm -f -- "$LIBPOSTAL_DATA_DIR"/last_updated*
            rm -f -- "$LIBPOSTAL_DATA_DIR"/*_version
        fi
    fi

    mkdir -p -- "$LIBPOSTAL_DATA_DIR"

    # download_release arguments: version file, wanted version, data dir,
    # chunk count, archive name, display name, then the module directories
    # that the archive replaces.
    case "$FILE" in
        base|all)
            # $BASIC_MODULE_DIRS is unquoted so each directory is passed as
            # its own argument.
            download_release "$LIBPOSTAL_DATA_FILE_VERSION_PATH" "$LIBPOSTAL_DATA_FILE_LATEST_VERSION" "$LIBPOSTAL_DATA_DIR" "$LIBPOSTAL_DATA_FILE_CHUNKS" "$LIBPOSTAL_DATA_FILE" "data file" $BASIC_MODULE_DIRS
            ;;
    esac
    case "$FILE" in
        parser|all)
            download_release "$LIBPOSTAL_PARSER_MODEL_VERSION_PATH" "$LIBPOSTAL_PARSER_MODEL_LATEST_VERSION" "$LIBPOSTAL_DATA_DIR" "$LIBPOSTAL_PARSER_MODEL_CHUNKS" "$LIBPOSTAL_PARSER_FILE" "parser data file" "$PARSER_MODULE_DIR"
            ;;
    esac
    case "$FILE" in
        language_classifier|all)
            download_release "$LIBPOSTAL_LANG_CLASS_MODEL_VERSION_PATH" "$LIBPOSTAL_LANG_CLASS_MODEL_LATEST_VERSION" "$LIBPOSTAL_DATA_DIR" "$LIBPOSTAL_LANG_CLASS_MODEL_CHUNKS" "$LIBPOSTAL_LANG_CLASS_FILE" "language classifier data file" "$LANGUAGE_CLASSIFIER_MODULE_DIR"
            ;;
    esac

    # Stamp the data directory with the current layout version.
    echo "$LIBPOSTAL_DATA_DIR_VERSION_STRING" > "$LIBPOSTAL_DATA_VERSION_FILE"

else
    echo "Invalid command: $COMMAND" >&2
    exit 1
fi
