extend bucketize op to support duplicated boundaries

upgrade bucketize op to support duplicated boundaries
author: Ellie Wen <dwen@fb.com> 2018-03-30 15:36:53 -0700
committer: Orion Reblitz-Richardson <orionr@gmail.com> 2018-03-30 21:00:44 -0700
commit: 363a227d1928b9427fad5624785128f026cf74db (patch)
tree: 8d927e988cc855ef951a88eb57bfd2ebc26038d0 /caffe2/operators
parent: 551d5fbf9a56769529307e075c17f0f9d5a80fda (diff)
download: pytorch-363a227d1928b9427fad5624785128f026cf74db.tar.gz
pytorch-363a227d1928b9427fad5624785128f026cf74db.tar.bz2
pytorch-363a227d1928b9427fad5624785128f026cf74db.zip
1 file changed, 17 insertions, 7 deletions
diff --git a/caffe2/operators/one_hot_ops.cc b/caffe2/operators/one_hot_ops.cc
index fdea2279be..2347c9ed66 100644
--- a/caffe2/operators/one_hot_ops.cc
+++ b/caffe2/operators/one_hot_ops.cc
@@ -125,11 +125,19 @@ bool BatchBucketOneHotOp<CPUContext>::RunOnDevice() {
 
     for (TIndex j = 0; j < D; j++) {
       // here we assume the boundary values for each feature are sorted
-      TIndex bucket_idx = std::lower_bound(
-                              boundaries_offset,
-                              boundaries_offset + lens_data[j],
-                              input_data[pos]) -
+      TIndex lower_bucket_idx = std::lower_bound(
+                                    boundaries_offset,
+                                    boundaries_offset + lens_data[j],
+                                    input_data[pos]) -
           boundaries_offset;
+
+      TIndex upper_bucket_idx = std::upper_bound(
+                                    boundaries_offset,
+                                    boundaries_offset + lens_data[j],
+                                    input_data[pos]) -
+          boundaries_offset;
+
+      TIndex bucket_idx = (lower_bucket_idx + upper_bucket_idx) / 2;
       output_data[i * output_dim + output_offset + bucket_idx] = 1.0;
       boundaries_offset += lens_data[j];
       output_offset += (lens_data[j] + 1);
@@ -196,11 +204,13 @@ Note that each bucket is right-inclusive. That is, given boundary values
 [b1, b2, b3], the buckets are defined as (-int, b1], (b1, b2], (b2, b3], (b3, inf).
 For example
 
-  If data = [[2, 3], [4, 1], [2, 5]], lengths = [2, 3],
-  and boundaries = [0.1, 2.5, 1, 3.1, 4.5], then
-
+  data = [[2, 3], [4, 1], [2, 5]], lengths = [2, 3],
+  If boundaries = [0.1, 2.5, 1, 3.1, 4.5], then
   output = [[0, 1, 0, 0, 1, 0, 0], [0, 0, 1, 1, 0, 0, 0], [0, 1, 0, 0, 0, 0, 1]]
 
+  If boundaries = [0.1, 2.5, 1, 1, 3.1], then
+  output = [[0, 1, 0, 0, 0, 1, 0], [0, 0, 1, 0, 1, 0, 0], [0, 1, 0, 0, 0, 0, 1]]
+
 )DOC")
     .Input(0, "data", "input tensor matrix")
     .Input(1, "lengths", "the size is the same as the width of the `data`")
author	Ellie Wen <dwen@fb.com>	2018-03-30 15:36:53 -0700
committer	Orion Reblitz-Richardson <orionr@gmail.com>	2018-03-30 21:00:44 -0700
commit	363a227d1928b9427fad5624785128f026cf74db (patch)
tree	8d927e988cc855ef951a88eb57bfd2ebc26038d0 /caffe2/operators
parent	551d5fbf9a56769529307e075c17f0f9d5a80fda (diff)
download	pytorch-363a227d1928b9427fad5624785128f026cf74db.tar.gz pytorch-363a227d1928b9427fad5624785128f026cf74db.tar.bz2 pytorch-363a227d1928b9427fad5624785128f026cf74db.zip