syntax = "proto2"; package caffe2; // A few notes about the Caffe2's protobuffer convention: // (1) Most objects are registered by their types, such as operators and nets. // For these, we have a string-type field "type" for registration purposes. // (2) We do not use extension because that used to create quite some conflicts // in Caffe's protobuf design. // (3) We have not used any proto3 specific features, such as Any or Map. This // is mainly for backward compability purposes but we may consider using // those in the future. // TensorProto stores serialized Tensor objects. message TensorProto { // The dimensions in the tensor. repeated int64 dims = 1; enum DataType { UNDEFINED = 0; FLOAT = 1; // float INT32 = 2; // int BYTE = 3; // BYTE, when deserialized, is going to be restored as uint8. STRING = 4; // string // Less-commonly used data types. BOOL = 5; // bool UINT8 = 6; // uint8_t INT8 = 7; // int8_t UINT16 = 8; // uint16_t INT16 = 9; // int16_t INT64 = 10; // int64_t FLOAT16 = 12; // at::Half DOUBLE = 13; // double } optional DataType data_type = 2 [default = FLOAT]; // For float repeated float float_data = 3 [packed = true]; // For int32, uint8, int8, uint16, int16, bool, and float16 // Note about float16: in storage we will basically convert float16 byte-wise // to unsigned short and then store them in the int32_data field. repeated int32 int32_data = 4 [packed = true]; // For bytes optional bytes byte_data = 5; // For strings repeated bytes string_data = 6; // For double repeated double double_data = 9 [packed = true]; // For int64 repeated int64 int64_data = 10 [packed = true]; // Optionally, a name for the tensor. optional string name = 7; // Optionally, a TensorProto can contain the details about the device that // it was serialized from. This is useful in cases like snapshotting a whole // workspace in a multi-GPU environment. optional DeviceOption device_detail = 8; // When loading from chunks this is going to indicate where to put data in the // full array. When not used full data have to be present message Segment { required int64 begin = 1; required int64 end = 2; } optional Segment segment = 11; } message QTensorProto { repeated int64 dims = 1; required int32 precision = 2; required double scale = 3; required double bias = 4; required bool is_signed = 5; repeated int32 data = 6 [packed = true]; optional string name = 7; optional TensorProto.DataType data_type = 8 [default = INT32]; } // TensorProtos stores multiple TensorProto objects in one single proto. This // is useful for small tensors; For anything big, consider using a DB for // storage. message TensorProtos { repeated TensorProto protos = 1; } message TensorShape { repeated int64 dims = 1; optional TensorProto.DataType data_type = 2 [default = FLOAT]; repeated int32 unknown_dims = 3; optional bool unknown_shape = 4 [default = false]; optional string name = 5; } message TensorShapes { repeated TensorShape shapes = 1; } // A named argument containing either singular float, integer and string // values, or repeated float, int and string arrays. message Argument { optional string name = 1; optional float f = 2; optional int64 i = 3; optional bytes s = 4; optional NetDef n = 8; repeated float floats = 5; repeated int64 ints = 6; repeated bytes strings = 7; repeated NetDef nets = 9; } // DeviceType that Caffe2 currently supports. 
// DeviceType that Caffe2 currently supports.
// Note: if you add a device type, make sure you add the corresponding device
// line in the DeviceTypeName() function in caffe2/utils/proto_utils.cc
// and update ATen/core/DeviceType.h
enum DeviceTypeProto {
  PROTO_CPU = 0;    // By default, we will use CPU.
  PROTO_CUDA = 1;   // CUDA.
  PROTO_MKLDNN = 2; // Reserved for explicit MKLDNN
  PROTO_OPENGL = 3; // OpenGL
  PROTO_OPENCL = 4; // OpenCL
  PROTO_IDEEP = 5;  // IDEEP.
  PROTO_HIP = 6;    // AMD HIP
  // Change the following number if you add more devices in the code.
  PROTO_COMPILE_TIME_MAX_DEVICE_TYPES = 7;
  PROTO_ONLY_FOR_TEST = 20901701; // This device type is only for tests.
}

// Device-specific options. We do not distinguish DeviceOption protos for
// different DeviceTypes, so currently all devices share the same DeviceOption
// proto. Fields that are specific to a device type are ignored if the type
// does not match.
// Note: if you add fields to DeviceOption, make sure you add the corresponding
// changes to the IsSameDevice() function in utils/proto_utils.{h,cc}.
message DeviceOption {
  // [general] Options that need to be carried out before running the execution.
  // optional DeviceType device_type = 1 [ default = CPU ];
  optional int32 device_type = 1 [default = 0]; // 0 is CPU.
  // [CUDA specific] the cuda gpu id.
  optional int32 device_id = 2;
  // [general] The random seed to start the device random number generator with.
  optional uint32 random_seed = 3;
  // [general] What node this op should execute on.
  // Used for net transformation purposes. Must be empty at execution time.
  optional string node_name = 4;
  // [CPU and Linux specific] NUMA node id
  optional int32 numa_node_id = 5;
  // [general] Extra information passed, not used at execution time currently.
  repeated string extra_info = 6;
  // [HIP specific] the hip gpu id.
  optional int32 hip_gpu_id = 7;
}

// Operator Definition.
message OperatorDef {
  repeated string input = 1;  // the names of the input blobs
  repeated string output = 2; // the names of the output blobs
  optional string name = 3;   // the operator name. This is optional.
  // the operator type. This is needed to create the object from the operator
  // registry.
  optional string type = 4;
  repeated Argument arg = 5;

  // The device option that the operator should run under.
  optional DeviceOption device_option = 6;

  // Optionally, one can specify an engine when there are multiple
  // implementations available simultaneously for one device type.
  // If one specifies an engine but that engine does not exist in the compiled
  // Caffe2 binary, Caffe2 will fall back to the default engine of that device
  // type.
  optional string engine = 7;

  // Additional 'fake' inputs used for expressing control dependencies
  // in the operator graph. This can be used to ensure that an
  // operator does not run until another operator is ready, e.g. for
  // scheduling control. These are not passed as actual inputs to the
  // Operator implementation, and are only used by the Net class for
  // scheduling purposes.
  repeated string control_input = 8;

  // The is_gradient_op argument is only used as a hint in shape inference
  // and has no runtime significance.
  optional bool is_gradient_op = 9 [default = false];

  // debug information associated with the construction of the operator.
  // This is an optional string with no assumed characteristics, as
  // operators can be constructed in any language.
  optional string debug_info = 10;
}
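// For illustration, a minimal sketch of an OperatorDef in protobuf text
// format; the operator type, blob names, and device settings are illustrative
// (device_type 1 corresponds to PROTO_CUDA in the enum above):
//
//   name: "relu_1"
//   type: "Relu"
//   input: "X"
//   output: "Y"
//   device_option {
//     device_type: 1
//     device_id: 0
//   }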
// Network definition.
message NetDef {
  optional string name = 1; // the network's name
  // Operators that the network contains.
  // Note: this is not named "operator" because that is a reserved word in C++.
  repeated OperatorDef op = 2;

  // The type of network that the net should be run with. This routes the
  // network instantiation to different execution modes. The default mode,
  // "simple", runs the operators in a sequential way as the original Caffe
  // implementation does.
  optional string type = 3;

  // the number of workers, if the operators in the network are to be carried
  // out in parallel.
  // Note: This is to be deprecated. Use the arg field with "num_workers" as
  // the key instead.
  optional int32 num_workers = 4 [deprecated = true];

  // The device option for the network. If a network has a specific device
  // option and one of its operators does not have it set, we will copy over
  // the device option to the operator. This allows us to basically avoid
  // putting device options at every operator.
  optional DeviceOption device_option = 5;

  repeated Argument arg = 6;

  // Two optional fields to declare external input and output of a net.
  // If these two are set, when a net is created, we will sanity check for
  // every op whether its input is declared (either as an external input,
  // or as an intermediate blob created by one of the ops), and sanity check
  // that all blobs in external_output are produced.
  //
  // In cases of memory optimization, declaring external_input and
  // external_output also ensures that the storage of these blobs is
  // persistent: for any blob in external_input and external_output, after a
  // network run finishes, their contents are actually the right contents. Any
  // intermediate blobs' contents may be overwritten.
  repeated string external_input = 7;
  repeated string external_output = 8;
}
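// For illustration, a minimal sketch of a NetDef in protobuf text format; the
// operator types and blob names are illustrative:
//
//   name: "example_net"
//   op { type: "FC" input: "X" input: "W" input: "b" output: "fc1" }
//   op { type: "Relu" input: "fc1" output: "fc1" }
//   external_input: "X"
//   external_input: "W"
//   external_input: "b"
//   external_output: "fc1"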
// ExecutionStep is actually a sort-of-hacky way we simulate iteration right
// now.
message ExecutionStep {
  // ExecutionStep should either contain a set of substeps or a set of
  // network names to run in this execution step. They should NOT both be set
  // at the same time.
  optional string name = 1;

  // An execution step could be recursive, in which case it involves a set of
  // substeps.
  repeated ExecutionStep substep = 2;

  // Alternatively, an execution step could involve one or more networks.
  // Note that you cannot have both substeps and networks. Choose one.
  // Note that an execution step refers to networks by their names. The actual
  // network definitions of the same names should be included in the network
  // field of the plan. The reason is that a network object might hold internal
  // states (think of a data layer), so we want to have the same network object
  // that multiple steps could ask to run.
  repeated string network = 3;

  // Number of iterations to run this step. The substeps or the networks
  // specified will be run sequentially, and one sequential run is considered
  // one iteration. If this is not set, the number of iterations is assumed to
  // be 1.
  optional int64 num_iter = 4;

  // Criteria network specifies a single output (TensorCPU) of
  // size (1), is run on every iteration by the executor, and
  // execution terminates when the output[0] is `false`.
  optional string criteria_network = 5 [deprecated = true];

  // DEPRECATED. Use `run_every_ms`.
  optional string report_net = 7;
  optional int32 report_interval = 8;

  // If provided, execute this step at every time interval (in milliseconds)
  // while its sibling execution steps execute in parallel. This step is
  // guaranteed to run at least once after all non-interval siblings finished.
  optional int64 run_every_ms = 11;

  // If false or not set, execute sub-steps serially.
  // If true, execute all substeps concurrently, each one in a separate thread.
  optional bool concurrent_substeps = 6;

  // Name of a scalar boolean tensor.
  // ES checks this blob AFTER every substep/subnet.
  // If specified, and the value is true, then ES will skip the rest and return
  // immediately.
  // This means that the report_net and the first step will always be called.
  // Use cases:
  // 1) the first substep stops the rest if a data condition is not met
  // 2) the first substep decides which of the rest of the steps should be run
  // 3) external control
  //
  // ** It is the user's responsibility not to put this blob in race conditions,
  // ** for example when setting this blob in concurrent substeps.
  optional string should_stop_blob = 9;

  // if only_once is true, this step will only be executed once. This ONLY takes
  // effect when using should_stop_blob.
  optional bool only_once = 10;

  // Whether to create a child workspace for this step.
  // If yes, the workflow and nets are re-created every time this step is run.
  optional bool create_workspace = 12;

  // How many copies of the children execution steps to run concurrently.
  optional int32 num_concurrent_instances = 13;
}

message PlanDef {
  // All the networks that are used in this execution. Note that networks
  // should be ordered in the way they are executed, i.e. for a layer in a
  // network, all its input blobs should already have been initialized by the
  // layers or networks defined before it.
  optional string name = 1;
  // The networks that are going to be used in this plan.
  repeated NetDef network = 2;
  repeated ExecutionStep execution_step = 3;
}

// Protobuf format for blobs that are not Tensors. We use a key to store the
// type of the blob. For example, for a serialized DBProto, the type should
// be "DBReader" and the content should be a serialized DBProto object.
message BlobProto {
  optional string name = 1;
  optional string type = 2;
  optional TensorProto tensor = 3;
  optional bytes content = 4;
  optional QTensorProto qtensor = 5;
  // If the blob is not a Tensor and is divided into chunks, content_num_chunks
  // contains the number of chunks into which the blob was divided.
  optional int32 content_num_chunks = 6;
  optional int32 content_chunk_id = 7;
}

// Protobuf format to serialize DBReader.
message DBReaderProto {
  // The name for the DB object in the workspace.
  optional string name = 1;
  // The source of the DB
  optional string source = 2;
  // The type of the DB
  optional string db_type = 3;
  // The current key of the DB if the DB supports seeking.
  optional string key = 4;
}
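// For illustration, a minimal sketch of a PlanDef in protobuf text format,
// showing how the PlanDef, NetDef and ExecutionStep messages above fit
// together. The net names and iteration count are purely illustrative, and
// the op lists of the two nets are omitted:
//
//   name: "example_plan"
//   network { name: "init_net" }
//   network { name: "train_net" }
//   execution_step {
//     name: "init"
//     network: "init_net"
//   }
//   execution_step {
//     name: "train"
//     network: "train_net"
//     num_iter: 100
//   }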