# Source code for tortik.page

# -*- coding: utf-8 -*-

import os
import sys
import time
import traceback
import hashlib
import random
from itertools import count
from functools import partial
from copy import copy
import json

try:
    import urlparse  # py2
except ImportError:
    import urllib.parse as urlparse  # py3

import tornado.web
import tornado.httpclient
from tornado.options import options, define
from tornado.escape import to_unicode
import tornado.gen
from jinja2 import Environment, PackageLoader
import six

from tortik.util import decorate_all, make_list, real_ip, make_qs
from tortik.util.dumper import dump
from tortik.util.xml_etree import tostring
from tortik.logger import PageLogger
from tortik.util.async import AsyncGroup
from tortik.util.parse import parse_xml, parse_json


# Password that unlocks the full debug page through the ``debug`` query argument:
# ``None`` disables it, an empty string accepts any value, otherwise the last
# ``debug`` argument must equal it (see ``RequestHandler.initialize``).
define('debug_password', default=None, type=str, help='Password for debug')
# When on, errors are rendered with the debug page (see ``RequestHandler.write_error``)
# and the debug template environment is created with ``auto_reload`` enabled.
define('debug', default=True, type=bool, help='Debug mode')
# ``max_clients`` passed to the shared ``tornado.httpclient.AsyncHTTPClient``.
define('tortik_max_clients', default=200, type=int, help='Max clients (requests) for http_client')
# Factor applied to ``connect_timeout`` and ``request_timeout`` in ``RequestHandler.make_request``.
define('tortik_timeout_multiplier', default=1.0, type=float, help='Timeout multiplier (affects all requests)')

# Possible values of ``RequestHandler.debug_type``.
_DEBUG_ALL = "all"  # accepted ``debug`` argument: the debug page replaces the normal output
_DEBUG_ONLY_ERRORS = "only_errors"  # ``debug`` option on: debug page is rendered from ``write_error``
_DEBUG_NONE = "none"  # debug output disabled

# Module-wide counter; every generated request id consumes one value from it.
stats = count()


def _gen_requestid():
    """Return a fresh request id as a 32-character hexadecimal string.

    The id is the MD5 hex digest of the concatenation of the current process
    id, the next value of the module-level ``stats`` counter and a random
    float.
    """
    seed = '{}{}{}'.format(os.getpid(), next(stats), random.random())
    digest = hashlib.md5(seed.encode('utf-8'))
    return digest.hexdigest()


# Metaclass applied to ``RequestHandler`` via ``six.add_metaclass``.
# It is built by ``tortik.util.decorate_all`` from (decorator, name) pairs;
# presumably it wraps the handler methods with each listed decorator — confirm
# in ``tortik.util``.
# NOTE(review): ``tornado.web.asynchronous`` no longer exists in Tornado 6 —
# confirm the Tornado version this module targets.
_decorates = decorate_all([
    (tornado.web.asynchronous, 'asynchronous'),  # should be the last
])


@six.add_metaclass(_decorates)
class RequestHandler(tornado.web.RequestHandler):
    """Base handler for request handling.

    Differs from ``tornado.web.RequestHandler`` in that all method handlers are
    ``@tornado.web.asynchronous`` by default.

    Handler completion should be done with the ``self.complete`` method
    instead of ``self.finish`` so that postprocessors are applied.
    """

    def initialize(self, *args, **kwargs):
        """Set up per-request state: debug mode, logger, http client and processors.

        The debug mode (``self.debug_type``) is chosen as follows:

        * ``_DEBUG_ALL`` when a ``debug`` query argument is present, the
          ``debug_password`` option is not ``None`` and the password is either
          empty or equal to the last ``debug`` argument value;
        * ``_DEBUG_ONLY_ERRORS`` when the ``debug`` option is on;
        * ``_DEBUG_NONE`` otherwise.
        """
        debug_pass = options.debug_password
        debug_args = self.get_arguments('debug')
        debug_arg_set = (len(debug_args) > 0 and debug_pass is not None and
                         (debug_pass == '' or debug_pass == debug_args[-1]))

        if debug_arg_set:
            self.debug_type = _DEBUG_ALL
        elif options.debug:
            self.debug_type = _DEBUG_ONLY_ERRORS
        else:
            self.debug_type = _DEBUG_NONE

        # The template environment is created once and shared by all handlers
        # through a class attribute.
        if self.debug_type != _DEBUG_NONE and not hasattr(RequestHandler, 'debug_loader'):
            # NOTE(review): the 'jinja2.ext.autoescape' extension is deprecated in
            # recent Jinja2 releases - confirm against the pinned Jinja2 version.
            environment = Environment(autoescape=True,
                                      loader=PackageLoader('tortik', 'templates'),
                                      extensions=['jinja2.ext.autoescape'],
                                      auto_reload=options.debug)

            environment.filters['split'] = lambda x, y: x.split(y)
            RequestHandler.debug_loader = environment

        self.error_detected = False

        # Reuse the id sent by the client; generate one only when it is absent.
        request_id = self.request.headers.get('X-Request-Id')
        if request_id is None:
            request_id = _gen_requestid()
        self.request_id = request_id

        self.log = PageLogger(self.request, self.request_id, (self.debug_type != _DEBUG_NONE),
                              handler_name=(type(self).__module__ + '.' + type(self).__name__))

        # Responses of finished ``fetch_requests`` calls, keyed by request name.
        self.responses = {}
        self.http_client = self.initialize_http_client()

        # Copy the (possibly class-level) processor lists so that
        # ``add_preprocessor``/``add_postprocessor`` affect this request only.
        self.preprocessors = copy(getattr(self, 'preprocessors', []))
        self.postprocessors = copy(getattr(self, 'postprocessors', []))

        self.log.info('Using http client: %s' % repr(self.http_client))

        # Parsed response data collected through ``add``.
        self._extra_data = {}

    @tornado.gen.coroutine
    def prepare(self):
        """Run every preprocessor and wait until all of them have called back.

        Each preprocessor is wrapped in ``tornado.gen.Task`` with the handler
        as its only positional argument.
        """
        if self.preprocessors:
            start_time = time.time()
            yield [tornado.gen.Task(preprocessor, self) for preprocessor in self.preprocessors]
            self.log.debug("Preprocessors completed in %.2fms", (time.time() - start_time)*1000.)

    @staticmethod
    def get_global_http_client():
        """Return the ``AsyncHTTPClient`` shared by all handlers, creating it on first use."""
        if not hasattr(RequestHandler, '_http_client'):
            RequestHandler._http_client = tornado.httpclient.AsyncHTTPClient(
                max_clients=options.tortik_max_clients)

        return RequestHandler._http_client

    def initialize_http_client(self):
        """Return the http client for this handler (the shared global client by default)."""
        return self.get_global_http_client()

    def add(self, name, data):
        """Store ``data`` under ``name`` in the handler's extra data."""
        self._extra_data[name] = data

    def get_data(self):
        """Return the dict of extra data collected with ``add``."""
        return self._extra_data

    def compute_etag(self):
        """Always return ``None``, which disables Tornado's default ETag support."""
        return None

    def on_finish(self):
        """Tell the page logger that the request is finished, with the final status."""
        self.log.complete_logging(self.get_status())

    def write_error(self, status_code, **kwargs):
        """Render an error response.

        With debug enabled the exception (if any) is logged and the debug page
        is sent; otherwise Tornado's default error page is used.

        :param int status_code: HTTP status code of the error
        :param kwargs: may contain ``exc_info`` with the exception triple
        :return: ``True`` when the debug page was sent, ``None`` otherwise
        """
        if self.debug_type in (_DEBUG_ALL, _DEBUG_ONLY_ERRORS):
            if 'exc_info' in kwargs:
                self.log.error("Uncaught exception %s\n%r", self._request_summary(),
                               self.request, exc_info=kwargs['exc_info'])

            # Nothing more can be written once the response has been finished.
            if self._finished:
                return

            self.set_status(status_code)
            self.log.complete_logging(status_code)
            self.finish_with_debug()

            return True
        else:
            super(RequestHandler, self).write_error(status_code, **kwargs)

    def finish_with_debug(self):
        """Finish the request with the rendered ``debug.html`` template.

        In ``_DEBUG_ALL`` mode the response status is forced to 200.
        """
        self.set_header('Content-Type', 'text/html; charset=utf-8')
        if self.debug_type == _DEBUG_ALL:
            self.set_status(200)

        self.finish(RequestHandler.debug_loader.get_template('debug.html').render(
            data=self.log.get_debug_info(),
            output_data=self.get_data(),
            size=sys.getsizeof,
            get_params=lambda x: urlparse.parse_qs(x, keep_blank_values=True),
            pretty_json=lambda x: json.dumps(x, sort_keys=True, indent=4, ensure_ascii=False),
            pretty_xml=lambda x: to_unicode(tostring(x.getroot() if hasattr(x, 'getroot') else x,
                                                     pretty_print=True, encoding='UTF-8')),
            to_unicode=to_unicode,
            dumper=dump,
            format_exception=lambda x: "".join(traceback.format_exception(*x))
        ))

    def complete(self, output_data=None):
        """Finish the request after running the postprocessor chain.

        Postprocessors are called one after another as
        ``postprocessor(handler, data, callback)``; each one continues the
        chain by calling ``callback(handler, data)``. After the last one the
        response is finished with the resulting data, or with the debug page
        in ``_DEBUG_ALL`` mode.

        :param output_data: data passed to the first postprocessor (or
            written directly when there are no postprocessors)
        """
        def finished_cb(handler, data):
            handler.log.complete_logging(handler.get_status())
            if handler.debug_type == _DEBUG_ALL:
                self.finish_with_debug()
                return

            self.finish(data)

        if self.postprocessors:
            last = len(self.postprocessors) - 1

            def add_cb(index):
                # Callback handed to postprocessor ``index``: either the final
                # callback or a trampoline into the next postprocessor.
                if index == last:
                    return finished_cb
                else:
                    def _cb(handler, data):
                        self.postprocessors[index + 1](handler, data, add_cb(index + 1))
                    return _cb

            self.postprocessors[0](self, output_data, add_cb(0))
        else:
            finished_cb(self, output_data)

    def fetch_requests(self, requests, callback=None, stage='page'):
        """Start the given HTTP requests and collect their responses.

        Every response is stored in ``self.responses`` under the request name.
        When its ``Content-Type`` contains ``xml`` or equals
        ``application/json`` the parsed body is put into ``response.data`` and
        also registered with ``self.add``.

        :param requests: a single request or a list of requests; each item is
            either a request object with a ``name`` attribute (see
            ``make_request``) or a ``(name, url)`` / ``(name, url, data)``
            tuple or list that is turned into a GET request
        :param callback: optional callable invoked without arguments from the
            ``AsyncGroup`` finish callback
        :param string stage: stage name used for logging
        """
        self.log.stage_started(stage)
        requests = make_list(requests)

        def _finish_cb():
            self.log.stage_complete(stage)
            if callback is not None:
                callback()

        ag = AsyncGroup(_finish_cb, self.log.debug, name=stage)

        def _on_fetch(response, name):
            content_type = response.headers.get('Content-Type', '').split(';')[0]
            response.data = None
            try:
                if 'xml' in content_type:
                    response.data = parse_xml(response)
                elif content_type == 'application/json':
                    response.data = parse_json(response)
            except Exception:
                # Parsing is best-effort, but KeyboardInterrupt/SystemExit must
                # not be swallowed.
                self.log.warning('Could not parse response with Content-Type header')

            if response.data is not None:
                self.add(name, response.data)

            self.responses[name] = response
            self.log.request_complete(response)

        for req in requests:
            if isinstance(req, (tuple, list)):
                assert len(req) in (2, 3)
                req = self.make_request(name=req[0], method='GET', full_url=req[1],
                                        data=req[2] if len(req) == 3 else '')
            self.log.request_started(req)
            self.http_client.fetch(req, ag.add(partial(_on_fetch, name=req.name)))

    def make_request(self, name, method='GET', full_url=None, url_prefix=None, path='', data='', headers=None,
                     connect_timeout=1, request_timeout=2, follow_redirects=True, **kwargs):
        """
        Build a ``tornado.httpclient.HTTPRequest`` object.

        The request url can be given in two ways:

            * ``full_url`` argument
            * ``url_prefix`` as domain part and ``path`` as path part

        For ``GET``, ``HEAD`` and ``DELETE`` requests ``data`` is merged into the
        query string; for other methods it becomes the request body.
        The ``X-Forwarded-For`` and ``X-Request-Id`` headers are always set and
        ``Content-Type`` defaults to ``application/x-www-form-urlencoded``.
        The passed ``headers`` object is copied, never modified.

        :param string name: Name of the request (for later accessing response through ``self.responses.get(name)``)
        :param string method: HTTP method, e.g. "GET" or "POST"
        :param string full_url: Full url for the requesting server (ex. ``http://example.com``)
        :param string url_prefix: Request url domain part
        :param string path: Request url path part
        :param data: Query to be passed to the request (could be a dict and would be translated to a query string)
        :type data: `string` or `dict`
        :param headers: Additional HTTP headers to pass on the request
        :type headers: ``tornado.httputil.HTTPHeaders`` or `dict`
        :param float connect_timeout: Timeout for initial connection in seconds
        :param float request_timeout: Timeout for entire request in seconds
        :param bool follow_redirects: Should redirects be followed automatically or return the 3xx response?
        :param kwargs: any other ``tornado.httpclient.HTTPRequest`` arguments
        :return: ``tornado.httpclient.HTTPRequest`` with an extra ``name`` attribute
        :raises TypeError: if neither or both of ``full_url`` and ``url_prefix`` are given,
            or if ``full_url`` is combined with ``path``
        """

        if (full_url is None) == (url_prefix is None):
            raise TypeError('make_request required path/url_prefix arguments pair or full_url argument')
        if full_url is not None and path != '':
            raise TypeError("Can't combine full_url and path arguments")

        scheme = 'http'
        query = ''
        body = None

        if full_url is not None:
            parsed_full_url = urlparse.urlsplit(full_url)
            scheme = parsed_full_url.scheme
            url_prefix = parsed_full_url.netloc
            path = parsed_full_url.path
            query = parsed_full_url.query

        if method in ('GET', 'HEAD', 'DELETE'):
            # Body-less methods: merge ``data`` over the query parsed from the url.
            parsed_query = urlparse.parse_qs(query)
            parsed_query.update(data if isinstance(data, dict) else urlparse.parse_qs(data))
            query = make_qs(parsed_query)
        else:
            body = make_qs(data) if isinstance(data, dict) else data

        # Work on a copy so the caller's headers object is never mutated.
        headers = {} if headers is None else copy(headers)

        headers.update({
            'X-Forwarded-For': real_ip(self.request),
            'X-Request-Id': self.request_id,
            'Content-Type': headers.get('Content-Type', 'application/x-www-form-urlencoded')
        })

        req = tornado.httpclient.HTTPRequest(
            url=urlparse.urlunsplit((scheme, url_prefix, path, query, '')),
            method=method,
            headers=headers,
            body=body,
            connect_timeout=connect_timeout*options.tortik_timeout_multiplier,
            request_timeout=request_timeout*options.tortik_timeout_multiplier,
            follow_redirects=follow_redirects,
            **kwargs
        )
        # ``fetch_requests`` uses this name as the key in ``self.responses``.
        req.name = name
        return req

    def add_preprocessor(self, preprocessor):
        """Append ``preprocessor`` to this request's preprocessor list."""
        self.preprocessors.append(preprocessor)

    def add_postprocessor(self, postprocessor):
        """Append ``postprocessor`` to this request's postprocessor list."""
        self.postprocessors.append(postprocessor)