author    | Gregory Chanan <gchanan@fb.com> | 2019-04-05 07:18:38 -0700
committer | Facebook Github Bot <facebook-github-bot@users.noreply.github.com> | 2019-04-05 07:21:39 -0700
commit    | 043e363c6c2bea3652ace954e4e70707cad0cf5e (patch)
tree      | 556dfe2af313f2b292177105a10642924a8e0277 /c10
parent    | b7c830b916879a69a5ef361b3cbdc73cd833f92d (diff)
download  | pytorch-043e363c6c2bea3652ace954e4e70707cad0cf5e.tar.gz pytorch-043e363c6c2bea3652ace954e4e70707cad0cf5e.tar.bz2 pytorch-043e363c6c2bea3652ace954e4e70707cad0cf5e.zip
Cache device on TensorImpl; clean up TensorImpl constructors. (#18833)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/18833
ghimport-source-id: 6f2be25fcc5e6be3ffe20582e604bd2c1fbab66b
Stack from [ghstack](https://github.com/ezyang/ghstack):
* **#18833 [STACK] Cache device on TensorImpl; clean up TensorImpl constructors.**
* #18832 [STACK] Disallow changing the device of a tensor via set_.
* #18831 [STACK] Stop swapping in Storages of the wrong device for Tensors.
1) We cache the device on TensorImpl. This lets us access the device without a virtual function call and makes it easier to extend TensorImpls (because subclasses no longer need to figure out how to store the Device themselves).
2) Clean up the TensorImpl constructors. Previously, one constructor took a TensorTypeId and an Allocator and decided whether to allocate a Storage based on which TensorTypeIds it recognized. Instead, we now have two constructors: one for tensors backed by a storage and one for tensors without a storage. (A minimal standalone sketch illustrating both points follows this list.)
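For readers skimming the summary without reading the diff below, here is a minimal, self-contained sketch of the two ideas. It uses simplified stand-in names (`SimpleStorage`, `SimpleTensorImpl`, `std::optional`) rather than the real c10 classes; it illustrates the approach, not the actual implementation.

```cpp
// Sketch only: simplified stand-ins for the real c10 classes.
#include <cassert>
#include <cstdint>
#include <iostream>
#include <optional>
#include <utility>

enum class DeviceType { CPU, CUDA };

struct Device {
  DeviceType type;
  int8_t index;  // -1 means "no index"
};

struct SimpleStorage {
  Device device;
  // data pointer, byte size, allocator, ... omitted
};

class SimpleTensorImpl {
 public:
  // (2) Storage-backed constructor: the cached device mirrors the storage's device.
  explicit SimpleTensorImpl(SimpleStorage storage)
      : storage_(std::move(storage)), device_opt_(storage_->device) {}

  // (2) Storage-less constructor (e.g. an undefined tensor): the device may be absent.
  explicit SimpleTensorImpl(std::optional<Device> device_opt)
      : device_opt_(device_opt) {}

  // (1) Non-virtual accessor: reads the cached field, no dispatch through storage.
  Device device() const {
    assert(device_opt_.has_value() && "tensor does not have a device");
    return *device_opt_;
  }

  bool has_device() const { return device_opt_.has_value(); }

 private:
  std::optional<SimpleStorage> storage_;  // absent for storage-less tensors
  std::optional<Device> device_opt_;      // cached copy of the device
};

int main() {
  SimpleTensorImpl cuda_tensor(SimpleStorage{Device{DeviceType::CUDA, 0}});
  SimpleTensorImpl undefined(std::nullopt);

  std::cout << "cuda index: " << int(cuda_tensor.device().index) << "\n";
  std::cout << "undefined has device: " << undefined.has_device() << "\n";
  return 0;
}
```

The trade-off, visible in the diff below, is one extra 8-byte word per TensorImpl (the size static_assert moves from 26 to 27 words) in exchange for removing the virtual get_device_slow() dispatch.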
Reviewed By: dzhulgakov
Differential Revision: D14766230
fbshipit-source-id: 745b8db84dcd6cb58f1a8675ad3ff8d033bc50df
Diffstat (limited to 'c10')
-rw-r--r-- | c10/core/TensorImpl.cpp          | 22
-rw-r--r-- | c10/core/TensorImpl.h            | 87
-rw-r--r-- | c10/core/UndefinedTensorImpl.cpp |  2
3 files changed, 60 insertions, 51 deletions
```diff
diff --git a/c10/core/TensorImpl.cpp b/c10/core/TensorImpl.cpp
index 446d206c24..258d402a90 100644
--- a/c10/core/TensorImpl.cpp
+++ b/c10/core/TensorImpl.cpp
@@ -33,26 +33,26 @@ const at::Tensor& TensorImpl::grad() const {
   }
 }
 
-TensorImpl::TensorImpl(TensorTypeId type_id, const caffe2::TypeMeta& data_type, Allocator *allocator, bool is_variable)
-  : TensorImpl({}, type_id, data_type, is_variable) {
-  // Variables, UndefinedTensors and SparseTensors don't have storages.
-  if (!is_variable && type_id != UndefinedTensorId() && data_type.id() != caffe2::TypeIdentifier::uninitialized()
-      && type_id != SparseCPUTensorId() && type_id != SparseCUDATensorId()) {
-    storage_ = Storage(data_type, 0, allocator, true);
-  }
-}
-
 TensorImpl::TensorImpl(Storage&& storage, TensorTypeId type_id, bool is_variable)
-  : TensorImpl(std::move(storage), type_id, storage.dtype(), is_variable) {}
+  : TensorImpl(std::move(storage), type_id, storage.dtype(), storage.device(), is_variable) {}
+
+TensorImpl::TensorImpl(TensorTypeId type_id, const caffe2::TypeMeta& data_type, c10::optional<c10::Device> device_opt, bool is_variable)
+  : TensorImpl({}, type_id, data_type, std::move(device_opt), is_variable) {}
 
-TensorImpl::TensorImpl(Storage&& storage, TensorTypeId type_id, const caffe2::TypeMeta& data_type, bool is_variable)
+TensorImpl::TensorImpl(Storage&& storage, TensorTypeId type_id, const caffe2::TypeMeta& data_type,
+                       c10::optional<c10::Device> device_opt, bool is_variable)
     : storage_(std::move(storage)),
       sizes_{0},
       storage_offset_(0),
       numel_(0),
       data_type_(data_type),
+      device_opt_(device_opt),
       type_id_(type_id),
       is_variable_(is_variable) {
+  AT_ASSERT(type_id == UndefinedTensorId() || data_type.id() == caffe2::TypeIdentifier::uninitialized() ||
+            device_opt_.has_value());
+  // we would also like to check that non-cpu devices have an index, but some Caffe2 operators create
+  // Storages with default devices.
   strides_.push_back(1);
 }
diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h
index 24ddaac302..bef73f0e52 100644
--- a/c10/core/TensorImpl.h
+++ b/c10/core/TensorImpl.h
@@ -213,23 +213,21 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
   TensorImpl() = delete;
 
   /**
-   * Construct a 1-dim 0-size tensor with the given settings.
-   * The provided allocator will be used to allocate data on
-   * subsequent resize.
+   * Construct a 1-dim 0-size tensor backed by the given storage.
    */
-  TensorImpl(TensorTypeId type_id, const caffe2::TypeMeta& data_type, Allocator *allocator, bool is_variable);
+  TensorImpl(Storage&& storage, TensorTypeId type_id, bool is_variable);
 
   /**
-   * Construct a 1-dim 0-size tensor backed by the given storage.
+   * Construct a 1-dim 0 size tensor that doesn't have a storage.
    */
-  TensorImpl(Storage&& storage, TensorTypeId type_id, bool is_variable);
+  TensorImpl(TensorTypeId type_id, const caffe2::TypeMeta& data_type, c10::optional<c10::Device> device_opt, bool is_variable);
 
  private:
   // This constructor is private, because the data_type is redundant with
   // storage.  Still, we pass it in separately because it's easier to write
   // the initializer list if we're not worried about storage being moved out
   // from under us.
-  TensorImpl(Storage&& storage, TensorTypeId type_id, const caffe2::TypeMeta& data_type, bool is_variable);
+  TensorImpl(Storage&& storage, TensorTypeId type_id, const caffe2::TypeMeta& data_type, c10::optional<c10::Device>, bool is_variable);
 
  public:
   TensorImpl(const TensorImpl&) = delete;
@@ -361,31 +359,25 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
   }
 
   int64_t get_device() const {
-    // NB: This method is not virtual and tries to avoid dispatches in the common case for perf.
-    const auto tid = type_id();
-    if (tid == CUDATensorId() || tid == HIPTensorId() || tid == MSNPUTensorId() || tid == XLATensorId()) {
-      // TODO: #12934 investigate caching device on TensorImpl to avoid this vdispatch.
-      return storage().device().index();
+    if (device_opt_.has_value()) {
+      // See NOTE [c10::optional operator usage in CUDA]
+      return (*device_opt_).index();
     }
-    return get_device_slow();
+
+    AT_ERROR(
+        "tensor with backend ", toString(tensorTypeIdToBackend(type_id())),
+        " does not have a device");
   }
 
   Device device() const {
-    // Special case the common case for performance reasons
-    // TODO: This is a little convoluted so it would be good to investigate
-    // caching device on TensorImpl (#12934) to speed up device() calls in all cases.
-    const auto tid = type_id();
-    if (tid == CPUTensorId() || tid == CUDATensorId() || tid == HIPTensorId() || tid == MSNPUTensorId() ||
-        tid == XLATensorId()) {
-      // NB: storage(), not storage_, b/c of Variable.
-      const auto& mystorage = storage();
-      if (mystorage) {
-        return mystorage.device();
-      }
+    if (device_opt_.has_value()) {
+      // See NOTE [c10::optional operator usage in CUDA]
+      return *device_opt_;
     }
-    const auto device_type = computeDeviceType(tid);
-    bool not_cpu = device_type != DeviceType::CPU;
-    return Device(device_type, not_cpu ? get_device() : -1);
+
+    AT_ERROR(
+        "tensor with backend ", toString(tensorTypeIdToBackend(type_id())),
+        " does not have a device");
   }
 
   Layout layout() const {
@@ -873,15 +865,11 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
   }
 
  private:
-  // As an optimization, get_device handles the typical CUDA Tensor case and
-  // calls get_device_slow if the tensor stores its device somewhere else
-  // (VariableImpl, SparseTensorImpl).  This methods does a virtual dispatch
-  // that makes it 10-20ns slower than the special-cased CUDA Tensor case.
-  virtual int64_t get_device_slow() const {
-    AT_ERROR(
-        "get_device is not implemented for tensors with ",
-        toString(tensorTypeIdToBackend(type_id())),
-        " backend");
+  // See NOTE [c10::optional operator usage in CUDA]
+  // We probably don't want to expose this publically until
+  // the note is addressed.
+  c10::optional<c10::Device> device_opt() const {
+    return device_opt_;
   }
 
  public:
@@ -891,7 +879,9 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
    */
   DeviceType device_type() const {
     AT_ASSERT(!is_variable());  // TODO: remove this when Variable and Tensor are merged
-    return storage_.device_type();
+    AT_ASSERT(device_opt_.has_value());
+    // See NOTE [c10::optional operator usage in CUDA]
+    return (*device_opt_).type();
   }
 
   /**
@@ -899,7 +889,8 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
    * device).
    */
   Device GetDevice() const {
-    return storage_.device();
+    // See NOTE [c10::optional operator usage in CUDA]
+    return *device_opt_;
   }
 
   /**
@@ -1125,6 +1116,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
      */
     storage_ = src.storage();
     data_type_ = src.dtype();
+    device_opt_ = src.device_opt();
     storage_offset_ = src.storage_offset();
   }
 
@@ -1143,6 +1135,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
       storage_.UniqueStorageShareExternalPointer(
           std::move(data_ptr), data_type, capacity);
       data_type_ = data_type;
+      device_opt_ = storage_.device();
       storage_offset_ = 0;
     } else {
       int64_t numel = capacity / data_type.itemsize();
@@ -1154,6 +1147,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
           /*allocator=*/nullptr,
           /*resizable=*/false);
       data_type_ = data_type;
+      device_opt_ = storage_.device();
       storage_offset_ = 0;
     }
   }
@@ -1184,6 +1178,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
       }
     }
     data_type_ = meta;
+    // NB: device is not changed
 
     // We can reuse the existing buffer if the current data does not have
     // a special destructor and the new data doesn't have a special
@@ -1219,6 +1214,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
       }
       storage_.set_numel(numel_);
       AT_ASSERT(storage_offset_ == 0); // because we just reallocated
+      device_opt_ = storage_.device();
       return storage_.data();
     }
   }
@@ -1264,6 +1260,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
     AT_CHECK(allow_tensor_metadata_change(), "set_storage is not allowed on Tensor created from .data or .detach()");
     storage_ = std::move(storage);
     data_type_ = storage_.dtype();
+    device_opt_ = storage_.device();
   }
 
  private:
@@ -1401,6 +1398,17 @@ protected:
   // agree with the type meta in storage
   caffe2::TypeMeta data_type_;
 
+  // NOTE [c10::optional operator usage in CUDA]
+  // Our optional definition doesn't compile in .cu file if `value()` or
+  // `operator->` are used.  Instead, we always use `operator*`.
+  // See https://github.com/pytorch/pytorch/issues/18496 for more info.
+  // If this is too burdensome to maintain, we can just
+  // manually implement this with an additional bool.
+
+  // INVARIANT: When storage is non-null, this Device must
+  // agree with the type meta in storage.
+  c10::optional<c10::Device> device_opt_;
+
   // You get to have eight byte-size fields here, before you
   // should pack this into a bitfield.
   TensorTypeId type_id_;
@@ -1475,12 +1483,13 @@ protected:
 //    storage offset
 //    numel
 //    data type pointer
+//    (optional) device
 //    autograd metadata pointer
 //    PyObject pointer
 //    miscellaneous bitfield
 //
 static_assert(sizeof(void*) != sizeof(int64_t) ||  // if 64-bit...
-              sizeof(TensorImpl) == sizeof(int64_t) * 26,
+              sizeof(TensorImpl) == sizeof(int64_t) * 27,
               "You changed the size of TensorImpl on 64-bit arch."
               "See Note [TensorImpl size constraints] on how to proceed.");
diff --git a/c10/core/UndefinedTensorImpl.cpp b/c10/core/UndefinedTensorImpl.cpp
index badc5ec7e4..472125467a 100644
--- a/c10/core/UndefinedTensorImpl.cpp
+++ b/c10/core/UndefinedTensorImpl.cpp
@@ -5,7 +5,7 @@ namespace c10 {
 
 // should this use the globalContext? Can it get a context passed in somehow?
 UndefinedTensorImpl::UndefinedTensorImpl()
-: TensorImpl(UndefinedTensorId(), caffe2::TypeMeta(), nullptr, /* is variable */ false) {
+: TensorImpl(UndefinedTensorId(), caffe2::TypeMeta(), c10::nullopt, /* is variable */ false) {
 }
 
 IntArrayRef UndefinedTensorImpl::sizes() const {
```