| author | Ellie Wen <dwen@fb.com> | 2018-03-30 15:36:53 -0700 |
| --- | --- | --- |
| committer | Orion Reblitz-Richardson <orionr@gmail.com> | 2018-03-30 21:00:44 -0700 |
| commit | 363a227d1928b9427fad5624785128f026cf74db (patch) | |
| tree | 8d927e988cc855ef951a88eb57bfd2ebc26038d0 | |
| parent | 551d5fbf9a56769529307e075c17f0f9d5a80fda (diff) | |
| download | pytorch-363a227d1928b9427fad5624785128f026cf74db.tar.gz, pytorch-363a227d1928b9427fad5624785128f026cf74db.tar.bz2, pytorch-363a227d1928b9427fad5624785128f026cf74db.zip | |
extend bucketize op to support duplicated boundaries
upgrade bucketize op to support duplicated boundaries
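In effect, the op now computes both the `std::lower_bound` and the `std::upper_bound` index of the input value over the feature's sorted boundaries and uses the midpoint of the two as the bucket index, so a value equal to a duplicated boundary lands in a bucket of its own, between the buckets the duplicates delimit. A minimal standalone sketch of that rule, using numpy's `searchsorted` in place of the C++ calls (the `bucketize` helper below is illustrative only, not part of this patch):

```python
# Sketch of the new bucketing rule: np.searchsorted with side='left'/'right'
# mirrors std::lower_bound/std::upper_bound; the midpoint of the two
# insertion indices gives a value equal to a duplicated boundary its own
# bucket between the duplicates.
import numpy as np

def bucketize(value, boundaries):
    lower = np.searchsorted(boundaries, value, side='left')   # std::lower_bound
    upper = np.searchsorted(boundaries, value, side='right')  # std::upper_bound
    return (lower + upper) // 2

boundaries = [0.1, 1.0, 1.0, 3.1]  # 1.0 is a duplicated boundary
print(bucketize(0.5, boundaries))  # 1 -> bucket (0.1, 1.0]
print(bucketize(1.0, boundaries))  # 2 -> the extra bucket between the duplicates
print(bucketize(2.0, boundaries))  # 3 -> bucket (1.0, 3.1]
```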
| -rw-r--r-- | caffe2/operators/one_hot_ops.cc | 24 |
| -rw-r--r-- | caffe2/python/operator_test/one_hot_ops_test.py | 21 |

2 files changed, 34 insertions, 11 deletions
```diff
diff --git a/caffe2/operators/one_hot_ops.cc b/caffe2/operators/one_hot_ops.cc
index fdea2279be..2347c9ed66 100644
--- a/caffe2/operators/one_hot_ops.cc
+++ b/caffe2/operators/one_hot_ops.cc
@@ -125,11 +125,19 @@ bool BatchBucketOneHotOp<CPUContext>::RunOnDevice() {
     for (TIndex j = 0; j < D; j++) {
       // here we assume the boundary values for each feature are sorted
-      TIndex bucket_idx = std::lower_bound(
-          boundaries_offset,
-          boundaries_offset + lens_data[j],
-          input_data[pos]) -
+      TIndex lower_bucket_idx = std::lower_bound(
+          boundaries_offset,
+          boundaries_offset + lens_data[j],
+          input_data[pos]) -
           boundaries_offset;
+
+      TIndex upper_bucket_idx = std::upper_bound(
+          boundaries_offset,
+          boundaries_offset + lens_data[j],
+          input_data[pos]) -
+          boundaries_offset;
+
+      TIndex bucket_idx = (lower_bucket_idx + upper_bucket_idx) / 2;
       output_data[i * output_dim + output_offset + bucket_idx] = 1.0;
       boundaries_offset += lens_data[j];
       output_offset += (lens_data[j] + 1);
@@ -196,11 +204,13 @@ Note that each bucket is right-inclusive. That is, given boundary values
 [b1, b2, b3], the buckets are defined as (-inf, b1], (b1, b2], (b2, b3], (b3, inf).
 For example
-  If data = [[2, 3], [4, 1], [2, 5]], lengths = [2, 3],
-  and boundaries = [0.1, 2.5, 1, 3.1, 4.5], then
-
+  data = [[2, 3], [4, 1], [2, 5]], lengths = [2, 3],
+  If boundaries = [0.1, 2.5, 1, 3.1, 4.5], then
   output = [[0, 1, 0, 0, 1, 0, 0], [0, 0, 1, 1, 0, 0, 0], [0, 1, 0, 0, 0, 0, 1]]
+  If boundaries = [0.1, 2.5, 1, 1, 3.1], then
+  output = [[0, 1, 0, 0, 0, 1, 0], [0, 0, 1, 0, 1, 0, 0], [0, 1, 0, 0, 0, 0, 1]]
+
 )DOC")
     .Input(0, "data", "input tensor matrix")
     .Input(1, "lengths", "the size is the same as the width of the `data`")
diff --git a/caffe2/python/operator_test/one_hot_ops_test.py b/caffe2/python/operator_test/one_hot_ops_test.py
index 5d20e3ce98..edc850621f 100644
--- a/caffe2/python/operator_test/one_hot_ops_test.py
+++ b/caffe2/python/operator_test/one_hot_ops_test.py
@@ -58,14 +58,20 @@ class TestOneHotOps(hu.HypothesisTestCase):
     @given(
         x=hu.tensor(
             min_dim=2, max_dim=2, dtype=np.float32,
-            elements=st.floats(min_value=-5, max_value=5)),
+            elements=st.integers(min_value=-5, max_value=5)),
+        seed=st.integers(min_value=0, max_value=1000),
         **hu.gcs_cpu_only)
-    def test_batch_bucketized_one_hot(self, x, gc, dc):
+    def test_batch_bucketized_one_hot(self, x, seed, gc, dc):
+        np.random.seed(seed)
         d = x.shape[1]
         lens = np.random.randint(low=1, high=5, size=d)
         boundaries = []
         for i in range(d):
-            cur_boundary = np.random.randn(lens[i]) * 5
+            # add [0, 0] as duplicated boundary for heuristic bucketization
+            if lens[i] > 2:
+                cur_boundary = np.append(np.random.randn(lens[i] - 2) * 5, [0, 0])
+            else:
+                cur_boundary = np.random.randn(lens[i]) * 5
             cur_boundary.sort()
             boundaries += cur_boundary.tolist()
@@ -78,11 +84,18 @@
         boundary_offset = 0
         output_offset = 0
         for i, l in enumerate(lens):
-            bucket_idx = np.digitize(
+            bucket_idx_right = np.digitize(
                 x[:, i],
                 boundaries[boundary_offset:boundary_offset + l],
                 right=True
             )
+            bucket_idx_left = np.digitize(
+                x[:, i],
+                boundaries[boundary_offset:boundary_offset + l],
+                right=False
+            )
+            bucket_idx = np.floor_divide(
+                np.add(bucket_idx_right, bucket_idx_left), 2)
             for j in range(x.shape[0]):
                 ret[j, output_offset + bucket_idx[j]] = 1.0
             boundary_offset += lens[i]
```
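As a quick check of the docstring examples in the diff above, the following numpy sketch mirrors the test's `np.digitize` logic end to end; `batch_bucket_one_hot` is a hypothetical stand-in written for illustration, not the Caffe2 operator itself:

```python
# Reproduces the two docstring examples with the midpoint bucketing rule.
import numpy as np

def batch_bucket_one_hot(data, lengths, boundaries):
    data = np.asarray(data, dtype=np.float32)
    n = data.shape[0]
    out = np.zeros((n, sum(l + 1 for l in lengths)), dtype=np.float32)
    boundary_offset = 0
    output_offset = 0
    for i, l in enumerate(lengths):
        b = boundaries[boundary_offset:boundary_offset + l]
        # right=True mirrors std::lower_bound, right=False std::upper_bound;
        # their midpoint matches the (lower + upper) / 2 rule in the C++ op.
        right = np.digitize(data[:, i], b, right=True)
        left = np.digitize(data[:, i], b, right=False)
        idx = (right + left) // 2
        out[np.arange(n), output_offset + idx] = 1.0
        boundary_offset += l
        output_offset += l + 1
    return out

data = [[2, 3], [4, 1], [2, 5]]
print(batch_bucket_one_hot(data, [2, 3], [0.1, 2.5, 1, 3.1, 4.5]))
# rows: [0 1 0 0 1 0 0], [0 0 1 1 0 0 0], [0 1 0 0 0 0 1]
print(batch_bucket_one_hot(data, [2, 3], [0.1, 2.5, 1, 1, 3.1]))
# rows: [0 1 0 0 0 1 0], [0 0 1 0 1 0 0], [0 1 0 0 0 0 1]
```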