from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from caffe2.proto import caffe2_pb2
from caffe2.python import core, workspace
import onnx
import onnx.defs
from onnx.helper import make_node, make_graph, make_tensor, make_tensor_value_info, make_model
from onnx.backend.base import namedtupledict
from caffe2.python.models.download import downloadFromURLToFile, getURLFromName, deleteDirectory
import caffe2.python.onnx.backend as c2
from caffe2.python.onnx.workspace import Workspace
from caffe2.python.trt.transform import convert_onnx_model_to_trt_op, transform_caffe2_net
from caffe2.python.onnx.tests.test_utils import TestCase
import numpy as np
import os.path
import json
import time
import unittest
import tarfile
import tempfile
import shutil
from six.moves.urllib.request import urlretrieve

def _print_net(net):
    for i in net.external_input:
        print("Input: {}".format(i))
    for i in net.external_output:
        print("Output: {}".format(i))
    for op in net.op:
        print("Op {}".format(op.type))
        for x in op.input:
            print("  input: {}".format(x))
        for y in op.output:
            print("  output: {}".format(y))


_BASE_URL = 'https://s3.amazonaws.com/download.onnx/models/opset_{}'.format(onnx.defs.onnx_opset_version())

# TODO: This is copied from https://github.com/onnx/onnx/blob/master/onnx/backend/test/runner/__init__.py. Maybe we should
# expose a model retrival API from ONNX
def _download_onnx_model(model_name):
    onnx_home = os.path.expanduser(os.getenv('ONNX_HOME', os.path.join('~', '.onnx')))
    models_dir = os.getenv('ONNX_MODELS',
                           os.path.join(onnx_home, 'models'))
    model_dir = os.path.join(models_dir, model_name)
    if not os.path.exists(os.path.join(model_dir, 'model.onnx')):
        if os.path.exists(model_dir):
            bi = 0
            while True:
                dest = '{}.old.{}'.format(model_dir, bi)
                if os.path.exists(dest):
                    bi += 1
                    continue
                shutil.move(model_dir, dest)
                break
        os.makedirs(model_dir)

        # On Windows, NamedTemporaryFile can not be opened for a
        # second time
        url = '{}/{}.tar.gz'.format(_BASE_URL, model_name)
        download_file = tempfile.NamedTemporaryFile(delete=False)
        try:
            download_file.close()
            print('Start downloading model {} from {}'.format(
                model_name, url))
            urlretrieve(url, download_file.name)
            print('Done')
            with tarfile.open(download_file.name) as t:
                t.extractall(models_dir)
        except Exception as e:
            print('Failed to prepare data for model {}: {}'.format(
                model_name, e))
            raise
        finally:
            os.remove(download_file.name)
    return model_dir

class TensorRTOpTest(TestCase):
    def _test_relu_graph(self, X, batch_size, trt_max_batch_size):
        node_def = make_node("Relu", ["X"], ["Y"])
        Y_c2 = c2.run_node(node_def, {"X": X})
        graph_def = make_graph(
            [node_def],
            name="test",
            inputs=[make_tensor_value_info("X", onnx.TensorProto.FLOAT, [batch_size, 1, 3, 2])],
            outputs=[make_tensor_value_info("Y", onnx.TensorProto.FLOAT, [batch_size, 1, 3, 2])])
        model_def = make_model(graph_def, producer_name='relu-test')
        op_outputs = [x.name for x in model_def.graph.output]
        op = convert_onnx_model_to_trt_op(model_def, max_batch_size=trt_max_batch_size)
        device_option = core.DeviceOption(caffe2_pb2.CUDA, 0)
        op.device_option.CopyFrom(device_option)
        Y_trt = None
        ws = Workspace()
        with core.DeviceScope(device_option):
            ws.FeedBlob("X", X)
            ws.RunOperatorsOnce([op])
            output_values = [ws.FetchBlob(name) for name in op_outputs]
            Y_trt = namedtupledict('Outputs', op_outputs)(*output_values)
        np.testing.assert_almost_equal(Y_c2, Y_trt)


    @unittest.skipIf('TEST_C2_TRT' not in os.environ, "No TensortRT support")
    def test_relu_graph_simple(self):
        X = np.random.randn(1, 1, 3, 2).astype(np.float32)
        self._test_relu_graph(X, 1, 50)


    @unittest.skipIf('TEST_C2_TRT' not in os.environ, "No TensortRT support")
    def test_relu_graph_big_batch(self):
        X = np.random.randn(52, 1, 3, 2).astype(np.float32)
        self._test_relu_graph(X, 52, 50)


    @unittest.skipIf('TEST_C2_TRT' not in os.environ, "No TensortRT support")
    def test_resnet50(self):
        input_blob_dims = (1, 3, 224, 224)
        model_dir = _download_onnx_model('resnet50')
        model_def = onnx.load(os.path.join(model_dir, 'model.onnx'))
        op_inputs = [x.name for x in model_def.graph.input]
        op_outputs = [x.name for x in model_def.graph.output]
        n, c, h, w = input_blob_dims
        data = np.random.randn(n, c, h, w).astype(np.float32)
        Y_c2 = c2.run_model(model_def, {op_inputs[0]: data})
        op = convert_onnx_model_to_trt_op(model_def)
        device_option = core.DeviceOption(caffe2_pb2.CUDA, 0)
        op.device_option.CopyFrom(device_option)
        Y_trt = None
        ws = Workspace()
        with core.DeviceScope(device_option):
            ws.FeedBlob(op_inputs[0], data)
            ws.RunOperatorsOnce([op])
            output_values = [ws.FetchBlob(name) for name in op_outputs]
            Y_trt = namedtupledict('Outputs', op_outputs)(*output_values)
        np.testing.assert_allclose(Y_c2, Y_trt, rtol=1e-3)


class TensorRTTransformTest(TestCase):
    def _model_dir(self, model):
        caffe2_home = os.path.expanduser(os.getenv('ONNX_HOME', '~/.caffe2'))
        models_dir = os.getenv('ONNX_MODELS', os.path.join(caffe2_home, 'models'))
        return os.path.join(models_dir, model)

    def _download(self, model):
        model_dir = self._model_dir(model)
        assert not os.path.exists(model_dir)
        os.makedirs(model_dir)
        for f in ['predict_net.pb', 'init_net.pb', 'value_info.json']:
            url = getURLFromName(model, f)
            dest = os.path.join(model_dir, f)
            try:
                try:
                    downloadFromURLToFile(url, dest,
                                          show_progress=False)
                except TypeError:
                    # show_progress not supported prior to
                    # Caffe2 78c014e752a374d905ecfb465d44fa16e02a28f1
                    # (Sep 17, 2017)
                    downloadFromURLToFile(url, dest)
            except Exception as e:
                print("Abort: {reason}".format(reason=e))
                print("Cleaning up...")
                deleteDirectory(model_dir)
                exit(1)

    def _get_c2_model(self, model_name):
        model_dir = self._model_dir(model_name)
        if not os.path.exists(model_dir):
            self._download(model_name)
        c2_predict_pb = os.path.join(model_dir, 'predict_net.pb')
        c2_predict_net = caffe2_pb2.NetDef()
        with open(c2_predict_pb, 'rb') as f:
            c2_predict_net.ParseFromString(f.read())
        c2_predict_net.name = model_name

        c2_init_pb = os.path.join(model_dir, 'init_net.pb')
        c2_init_net = caffe2_pb2.NetDef()
        with open(c2_init_pb, 'rb') as f:
            c2_init_net.ParseFromString(f.read())
        c2_init_net.name = model_name + '_init'

        value_info = json.load(open(os.path.join(model_dir, 'value_info.json')))
        return c2_init_net, c2_predict_net, value_info

    def _add_head_tail(self, pred_net, new_head, new_tail):
        orig_head = pred_net.external_input[0]
        orig_tail = pred_net.external_output[0]

        # Add head
        head = caffe2_pb2.OperatorDef()
        head.type = "Copy"
        head.input.append(new_head)
        head.output.append(orig_head)
        dummy = caffe2_pb2.NetDef()
        dummy.op.extend(pred_net.op)
        del pred_net.op[:]
        pred_net.op.extend([head])
        pred_net.op.extend(dummy.op)
        pred_net.external_input[0] = new_head

        # Add tail
        tail = caffe2_pb2.OperatorDef()
        tail.type = "Copy"
        tail.input.append(orig_tail)
        tail.output.append(new_tail)
        pred_net.op.extend([tail])
        pred_net.external_output[0] = new_tail


    @unittest.skipIf('TEST_C2_TRT' not in os.environ, "No TensortRT support")
    def test_resnet50_core(self):
        N = 2
        warmup = 20
        repeat = 100
        print("Batch size: {}, repeat inference {} times, warmup {} times".format(N, repeat, warmup))
        init_net, pred_net, _  = self._get_c2_model('resnet50')
        self._add_head_tail(pred_net, 'real_data', 'real_softmax')
        input_blob_dims = (N, 3, 224, 224)
        input_name = "real_data"

        device_option = core.DeviceOption(caffe2_pb2.CUDA, 0)
        init_net.device_option.CopyFrom(device_option)
        pred_net.device_option.CopyFrom(device_option)
        for op in pred_net.op:
            op.device_option.CopyFrom(device_option)
            op.engine = 'CUDNN'
        net_outputs = pred_net.external_output
        Y_c2 = None
        data =  np.random.randn(*input_blob_dims).astype(np.float32)
        c2_time = 1
        ws = Workspace()
        with core.DeviceScope(device_option):
            ws.FeedBlob(input_name, data)
            ws.RunNetOnce(init_net)
            ws.CreateNet(pred_net)
            for _ in range(warmup):
                ws.RunNet(pred_net.name)
            start = time.time()
            for _ in range(repeat):
                ws.RunNet(pred_net.name)
            end = time.time()
            c2_time = end - start
            output_values = [ws.FetchBlob(name) for name in net_outputs]
            Y_c2 = namedtupledict('Outputs', net_outputs)(*output_values)
        ws.ResetWorkspace()

        # Cut the graph
        init_net_cut, pred_net_cut = transform_caffe2_net(init_net, pred_net, {input_name: input_blob_dims})
        del init_net, pred_net
        #print_net(pred_net_cut)

        Y_trt = None
        input_name = pred_net_cut.external_input[0]
        print("C2 runtime: {}s".format(c2_time))
        ws = Workspace()
        with core.DeviceScope(device_option):
            ws.FeedBlob(input_name, data)
            ws.RunNetOnce(init_net_cut)
            ws.CreateNet(pred_net_cut)
            for _ in range(warmup):
                ws.RunNet(pred_net_cut.name)
            start = time.time()
            for _ in range(repeat):
                ws.RunNet(pred_net_cut.name)
            end = time.time()
            trt_time = end - start
            print("TRT runtime: {}s, improvement: {}%".format(trt_time, (c2_time-trt_time)/c2_time*100))
            output_values = [ws.FetchBlob(name) for name in net_outputs]
            Y_trt = namedtupledict('Outputs', net_outputs)(*output_values)
        np.testing.assert_allclose(Y_c2, Y_trt, rtol=1e-3)