How (not) to build publicly available NLP web services

Material:

Converting command line tools to services

TreeTaggerWrapper

  • File: /mnt/storage/clfiles/resources/lib/python2.7/dist-packages/treetaggerwrapper.py
            # Send text to TreeTagger, get result.
            logger.debug("Tagging text.")
            t = threading.Thread(target=pipe_writer,
                                 args=(self.taginput,
                                       lines, self.dummysequence,
                                       self.taginencoding,
                                       self.taginencerr))
            t.start()
 
            result = []
            intext = False
            lastline_time = time.time()
            while True:
                line = self.tagoutput.readline()
                if DEBUG: logger.debug("Read from TreeTagger: %r", line)
                if not line:
                    if (time.time() - lastline_time) > TAGGER_TIMEOUT:
                        # We already wait some times, there may be a problem with tagging
                        # process communication. This avoid infinite loop.
                        logger.error("Time out for TreeTagger reply.")
                        raise TreeTaggerError("Time out for TreeTagger reply, enable debug / see error logs")
                    else:
                        # We process too much quickly, leave time for tagger and writer
                        # thread to work.
                        time.sleep(0.1)
                        continue    # read again.
                lastline_time = time.time()
 
                line = line.decode(self.tagoutencoding, self.tagoutencerr)
                line = line.strip()
                if line == STARTOFTEXT:
                    intext = True
                    continue
                if line == ENDOFTEXT:  # The flag we sent to identify texts.
                    intext = False
                    break
                if intext and line:
                    if not (self.removesgml and is_sgml_tag(line)):
                        result.append(line)
def pipe_writer(pipe, text, flushsequence, encoding, errors):
    """Write a text to a pipe and manage pre-post data to ensure flushing.

    For internal use.

    If text is composed of str strings, they are written as-is (i.e. the
    caller is assumed to have already applied the appropriate encoding).
    If it is composed of unicode strings, they are converted to the
    specified encoding.

    :param pipe: the Popen pipe to which the text is written.
    :type pipe: Popen object (file-like with write and flush methods)
    :param text: the text to write.
    :type text: string or list of strings
    :param flushsequence: lines of tokens to ensure flush by TreeTagger.
    :type flushsequence: string (with \\n between tokens)
    :param encoding: encoding of texts written on the pipe.
    :type encoding: str
    :param errors: how to manage encoding errors: strict/ignore/replace.
    :type errors: str
    """
    "de": {
        "encoding": "utf-8",
        "tagparfile": "german-utf8.par",
        "abbrevfile": "german-abbreviations-utf8",
        "pchar": ALONEMARKS + "'",
        "fchar": ALONEMARKS + "'",
        "pclictic": "",
        "fclictic": "'(s|re|ve|d|m|em|ll)|n't",
        "number": NUMBER_EXPRESSION,
        "dummysentence": "Das ist ein Testsatz um das Stossen der "
                         "daten sicherzustellen .",
        "replurlexp": 'replaced-url',
        "replemailexp": 'replaced-email',
        "replipexp": 'replaced-ip',
        "repldnsexp": 'replaced-dns'
    },

Connection limit in Nginx config

"limit_conn_zone" directive

# Shared-memory zone "addr" (10 MB) holding per-key connection state; the key
# is the client address in binary form ($binary_remote_addr).
limit_conn_zone $binary_remote_addr zone=addr:10m;
location /demo/parzu/ {
    # Allow at most one concurrent connection per key of the "addr" zone.
    limit_conn addr 1;
    # Strip the /demo/parzu/ prefix; "break" stops further rewrite processing.
    rewrite /demo/parzu/(.*)$ /$1 break;
    # Forward to the backend, appending the captured path and the query string.
    proxy_pass http://dutchy.cli/clfiles/projects/cl/webapp/parzu/$1$is_args$args;
}

location /demo/corzu/ {
    # Allow at most one concurrent connection per key of the "addr" zone.
    limit_conn addr 1;
    # Strip the /demo/corzu/ prefix; "break" stops further rewrite processing.
    rewrite /demo/corzu/(.*)$ /$1 break;
    # Forward to the backend, appending the captured path and the query string.
    proxy_pass http://dutchy.cli/harlie/projects/clcoref/corzu_web_demo/$1$is_args$args;
}

Self-locking application

ParZu

# Don spam/DDOS prevention
# Check if last log entry is older than X seconds. If not, abort.
# The time of the last call is taken from the log file's modification time.
if os.path.isfile(logfile):
    time_since_last_call = time.time() - os.stat(logfile).st_mtime
    # Refuse the request when the log file was modified less than 5 seconds ago.
    if time_since_last_call < 5:
        additional_styles = "\nfont-family: Arial, Helvetica, sans-serif;\nfont-size: 12px;\n"
        # NOTE(review): the message asks the user to wait 20 seconds, but the
        # check above uses a 5-second threshold -- confirm which is intended.
        print(html_text.format(additional_styles,'Demo already running. Wait 20 seconds and try again.' ))
        # Stop here so no new run is started for this request.
        sys.exit()

Try it out:

curl --data "output=conll&rawtext=Mein Luftkissenfahrzeug ist voller Aale." \
  https://pub.cl.uzh.ch/demo/parzu/parzu.cgi

CorZu

# Spam / DDOS prevention: if the parsed file was modified less than 5 seconds
# ago, answer with a "please wait" notice instead of starting anew.
if [[ -f $tmp_dir/parsed.conll ]]
then
    # Seconds since the file's last modification: current epoch time minus the
    # file's modification time (date -r FILE), both in epoch seconds.
    lastchange=$(($(date +%s) - $(date +%s -r $tmp_dir/parsed.conll)))
    if [[ "$lastchange" -lt 5 ]]
        then
        # Minimal response: content-type header, blank separator line, notice.
        echo Content-type: text/html
        echo ""
        echo "Demo already running. Please wait 5 seconds and try again."
        exit
    fi
fi

Try it out:

curl --form "format=conll" --form "text=Mein Luftkissenfahrzeug ist voller Aale. Sie sind überall." \
  https://pub.cl.uzh.ch/demo/corzu/CorZu.cgi

Pipelines

echo "Mein Luftkissenfahrzeug ist voller Aale. Sie sind überall." \
  | maltparser-tokenizer-treetagger-german.bash
echo "À cheval donné on ne regarde pas les dents." \
  | maltparser-tokenizer-MElt-french.bash
echo "Det är viktigt att du aktiverar ditt studentkonto på Studentportalen." \
  | maltparser-stagger-swedish.bash

Service

 echo "Mein Luftkissenfahrzeug ist voller Aale." \
   | curl --data @- pub.cl.uzh.ch/service/nlpservice/parzu

CL Wiki

Institute of Computational Linguistics – University of Zurich