| author | Ellie Wen <dwen@fb.com> | 2018-03-30 15:36:53 -0700 |
| --- | --- | --- |
| committer | Orion Reblitz-Richardson <orionr@gmail.com> | 2018-03-30 21:00:44 -0700 |
| commit | 363a227d1928b9427fad5624785128f026cf74db (patch) | |
| tree | 8d927e988cc855ef951a88eb57bfd2ebc26038d0 | |
| parent | 551d5fbf9a56769529307e075c17f0f9d5a80fda (diff) | |
| download | pytorch-363a227d1928b9427fad5624785128f026cf74db.tar.gz, pytorch-363a227d1928b9427fad5624785128f026cf74db.tar.bz2, pytorch-363a227d1928b9427fad5624785128f026cf74db.zip | |
extend bucketize op to support duplicated boundaries
upgrade bucketize op to support duplicated boundaries
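In effect, the op now computes both the `std::lower_bound` and the `std::upper_bound` index of the input value over the feature's sorted boundaries and uses the midpoint of the two as the bucket index, so a value equal to a duplicated boundary lands in a bucket of its own, between the buckets the duplicates delimit. A minimal standalone sketch of that rule, using numpy's `searchsorted` in place of the C++ calls (the `bucketize` helper below is illustrative only, not part of this patch):

```python
# Sketch of the new bucketing rule: np.searchsorted with side='left'/'right'
# mirrors std::lower_bound/std::upper_bound; the midpoint of the two
# insertion indices gives a value equal to a duplicated boundary its own
# bucket between the duplicates.
import numpy as np

def bucketize(value, boundaries):
    lower = np.searchsorted(boundaries, value, side='left')   # std::lower_bound
    upper = np.searchsorted(boundaries, value, side='right')  # std::upper_bound
    return (lower + upper) // 2

boundaries = [0.1, 1.0, 1.0, 3.1]  # 1.0 is a duplicated boundary
print(bucketize(0.5, boundaries))  # 1 -> bucket (0.1, 1.0]
print(bucketize(1.0, boundaries))  # 2 -> the extra bucket between the duplicates
print(bucketize(2.0, boundaries))  # 3 -> bucket (1.0, 3.1]
```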
| -rw-r--r-- | caffe2/operators/one_hot_ops.cc | 24 |
| -rw-r--r-- | caffe2/python/operator_test/one_hot_ops_test.py | 21 |

2 files changed, 34 insertions, 11 deletions
```diff
diff --git a/caffe2/operators/one_hot_ops.cc b/caffe2/operators/one_hot_ops.cc
index fdea2279be..2347c9ed66 100644
--- a/caffe2/operators/one_hot_ops.cc
+++ b/caffe2/operators/one_hot_ops.cc
@@ -125,11 +125,19 @@ bool BatchBucketOneHotOp<CPUContext>::RunOnDevice() {
     for (TIndex j = 0; j < D; j++) {
       // here we assume the boundary values for each feature are sorted
-      TIndex bucket_idx = std::lower_bound(
-          boundaries_offset,
-          boundaries_offset + lens_data[j],
-          input_data[pos]) -
+      TIndex lower_bucket_idx = std::lower_bound(
+          boundaries_offset,
+          boundaries_offset + lens_data[j],
+          input_data[pos]) -
           boundaries_offset;
+
+      TIndex upper_bucket_idx = std::upper_bound(
+          boundaries_offset,
+          boundaries_offset + lens_data[j],
+          input_data[pos]) -
+          boundaries_offset;
+
+      TIndex bucket_idx = (lower_bucket_idx + upper_bucket_idx) / 2;
       output_data[i * output_dim + output_offset + bucket_idx] = 1.0;
       boundaries_offset += lens_data[j];
       output_offset += (lens_data[j] + 1);
@@ -196,11 +204,13 @@ Note that each bucket is right-inclusive. That is, given boundary values
 [b1, b2, b3], the buckets are defined as (-inf, b1], (b1, b2], (b2, b3], (b3, inf).
 For example
-  If data = [[2, 3], [4, 1], [2, 5]], lengths = [2, 3],
-  and boundaries = [0.1, 2.5, 1, 3.1, 4.5], then
-
+  data = [[2, 3], [4, 1], [2, 5]], lengths = [2, 3],
+  If boundaries = [0.1, 2.5, 1, 3.1, 4.5], then
   output = [[0, 1, 0, 0, 1, 0, 0], [0, 0, 1, 1, 0, 0, 0], [0, 1, 0, 0, 0, 0, 1]]
+  If boundaries = [0.1, 2.5, 1, 1, 3.1], then
+  output = [[0, 1, 0, 0, 0, 1, 0], [0, 0, 1, 0, 1, 0, 0], [0, 1, 0, 0, 0, 0, 1]]
+
 )DOC")
     .Input(0, "data", "input tensor matrix")
     .Input(1, "lengths", "the size is the same as the width of the `data`")
diff --git a/caffe2/python/operator_test/one_hot_ops_test.py b/caffe2/python/operator_test/one_hot_ops_test.py
index 5d20e3ce98..edc850621f 100644
--- a/caffe2/python/operator_test/one_hot_ops_test.py
+++ b/caffe2/python/operator_test/one_hot_ops_test.py
@@ -58,14 +58,20 @@ class TestOneHotOps(hu.HypothesisTestCase):
     @given(
         x=hu.tensor(
             min_dim=2, max_dim=2, dtype=np.float32,
-            elements=st.floats(min_value=-5, max_value=5)),
+            elements=st.integers(min_value=-5, max_value=5)),
+        seed=st.integers(min_value=0, max_value=1000),
         **hu.gcs_cpu_only)
-    def test_batch_bucketized_one_hot(self, x, gc, dc):
+    def test_batch_bucketized_one_hot(self, x, seed, gc, dc):
+        np.random.seed(seed)
         d = x.shape[1]
         lens = np.random.randint(low=1, high=5, size=d)
         boundaries = []
         for i in range(d):
-            cur_boundary = np.random.randn(lens[i]) * 5
+            # add [0, 0] as duplicated boundary for heuristic bucketization
+            if lens[i] > 2:
+                cur_boundary = np.append(np.random.randn(lens[i] - 2) * 5, [0, 0])
+            else:
+                cur_boundary = np.random.randn(lens[i]) * 5
             cur_boundary.sort()
             boundaries += cur_boundary.tolist()
@@ -78,11 +84,18 @@
         boundary_offset = 0
         output_offset = 0
         for i, l in enumerate(lens):
-            bucket_idx = np.digitize(
+            bucket_idx_right = np.digitize(
                 x[:, i],
                 boundaries[boundary_offset:boundary_offset + l],
                 right=True
             )
+            bucket_idx_left = np.digitize(
+                x[:, i],
+                boundaries[boundary_offset:boundary_offset + l],
+                right=False
+            )
+            bucket_idx = np.floor_divide(
+                np.add(bucket_idx_right, bucket_idx_left), 2)
             for j in range(x.shape[0]):
                 ret[j, output_offset + bucket_idx[j]] = 1.0
             boundary_offset += lens[i]
```
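As a quick check of the docstring examples in the diff above, the following numpy sketch mirrors the test's `np.digitize` logic end to end; `batch_bucket_one_hot` is a hypothetical stand-in written for illustration, not the Caffe2 operator itself:

```python
# Reproduces the two docstring examples with the midpoint bucketing rule.
import numpy as np

def batch_bucket_one_hot(data, lengths, boundaries):
    data = np.asarray(data, dtype=np.float32)
    n = data.shape[0]
    out = np.zeros((n, sum(l + 1 for l in lengths)), dtype=np.float32)
    boundary_offset = 0
    output_offset = 0
    for i, l in enumerate(lengths):
        b = boundaries[boundary_offset:boundary_offset + l]
        # right=True mirrors std::lower_bound, right=False std::upper_bound;
        # their midpoint matches the (lower + upper) / 2 rule in the C++ op.
        right = np.digitize(data[:, i], b, right=True)
        left = np.digitize(data[:, i], b, right=False)
        idx = (right + left) // 2
        out[np.arange(n), output_offset + idx] = 1.0
        boundary_offset += l
        output_offset += l + 1
    return out

data = [[2, 3], [4, 1], [2, 5]]
print(batch_bucket_one_hot(data, [2, 3], [0.1, 2.5, 1, 3.1, 4.5]))
# rows: [0 1 0 0 1 0 0], [0 0 1 1 0 0 0], [0 1 0 0 0 0 1]
print(batch_bucket_one_hot(data, [2, 3], [0.1, 2.5, 1, 1, 3.1]))
# rows: [0 1 0 0 0 1 0], [0 0 1 0 1 0 0], [0 1 0 0 0 0 1]
```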