#include "caffe2/core/common_gpu.h" #include "caffe2/core/context_gpu.h" #include "caffe2/sgd/adam_op.h" namespace caffe2 { __global__ void AdamUpdate( int N, const float* g, const float* m, const float* v, float* ng, float* nm, float* nv, float beta1, float beta2, float eps_hat, float correction, const float* lr) { CUDA_1D_KERNEL_LOOP(i, N) { float gi = g[i]; float mi = nm[i] = m[i] * beta1 + gi * (1 - beta1); float vi = nv[i] = v[i] * beta2 + gi * gi * (1 - beta2); ng[i] = lr[0] * correction * mi / (sqrtf(vi) + eps_hat); } } template <> void adam_update( int N, const float* g, const float* m, const float* v, float* ng, float* nm, float* nv, float beta1, float beta2, float eps_hat, float correction, const float* lr, CUDAContext* context) { AdamUpdate<<< CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>( N, g, m, v, ng, nm, nv, beta1, beta2, eps_hat, correction, lr); } __global__ void AdamCompute( int N, const float* w, const float* g, const float* m, const float* v, float* nw, float* nm, float* nv, float beta1, float beta2, float eps_hat, float correction, const float* lr) { CUDA_1D_KERNEL_LOOP(i, N) { float gi = g[i]; float mi = nm[i] = m[i] * beta1 + gi * (1 - beta1); float vi = nv[i] = v[i] * beta2 + gi * gi * (1 - beta2); float ng = lr[0] * correction * mi / (sqrtf(vi) + eps_hat); nw[i] = w[i] + ng; } } template <> void adam_compute( int N, const float* w, const float* g, const float* m, const float* v, float* nw, float* nm, float* nv, float beta1, float beta2, float eps_hat, float correction, const float* lr, CUDAContext* context) { AdamCompute<<< CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>( N, w, g, m, v, nw, nm, nv, beta1, beta2, eps_hat, correction, lr); } __global__ void AdamComputeOutputGrad( int N, const float* w, const float* g, const float* m, const float* v, float* nw, float* nm, float* nv, float* ng, float beta1, float beta2, float eps_hat, float correction, const float* lr) { CUDA_1D_KERNEL_LOOP(i, N) { float gi = g[i]; float mi = nm[i] = m[i] * beta1 + gi * (1 - beta1); float vi = nv[i] = v[i] * beta2 + gi * gi * (1 - beta2); float ngi = ng[i] = correction * mi / (sqrtf(vi) + eps_hat); nw[i] = w[i] + lr[0] * ngi; } } template <> void adam_compute_output_grad( int N, const float* w, const float* g, const float* m, const float* v, float* nw, float* nm, float* nv, float* ng, float beta1, float beta2, float eps_hat, float correction, const float* lr, CUDAContext* context) { AdamComputeOutputGrad<<< CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>( N, w, g, m, v, nw, nm, nv, ng, beta1, beta2, eps_hat, correction, lr); } template __global__ void SparseAdamKernel( const size_t N, const size_t grad_slice_sz, const float beta1, const float beta2, const float epsilon, float* param, float* mom1, float* mom2, const SIndex* indices, const float* grad, const float correction, const float* lr, const float iter) { CUDA_1D_KERNEL_LOOP(i, N) { const size_t gradIdx = i; const SIndex index = indices[i / grad_slice_sz]; const size_t paramIdx = index * grad_slice_sz + (i % grad_slice_sz); float m1n = mom1[paramIdx] = mom1[paramIdx] * beta1 + grad[gradIdx] * (1.0f - beta1); float m2n = mom2[paramIdx] = mom2[paramIdx] * beta2 + grad[gradIdx] * grad[gradIdx] * (1.0f - beta2); param[paramIdx] += lr[0] * correction * m1n / (sqrt(m2n) + epsilon); } } template __global__ void SparseAdamOutputGradKernel( const size_t N, const size_t grad_slice_sz, const float beta1, const float beta2, const float epsilon, float* param, 
// Sparse Adam: each thread handles one element of the sparse gradient;
// indices[i / grad_slice_sz] selects the parameter slice that element updates.
template <typename SIndex>
__global__ void SparseAdamKernel(
    const size_t N,
    const size_t grad_slice_sz,
    const float beta1,
    const float beta2,
    const float epsilon,
    float* param,
    float* mom1,
    float* mom2,
    const SIndex* indices,
    const float* grad,
    const float correction,
    const float* lr,
    const float iter) {
  CUDA_1D_KERNEL_LOOP(i, N) {
    const size_t gradIdx = i;
    const SIndex index = indices[i / grad_slice_sz];
    const size_t paramIdx = index * grad_slice_sz + (i % grad_slice_sz);

    float m1n = mom1[paramIdx] =
        mom1[paramIdx] * beta1 + grad[gradIdx] * (1.0f - beta1);
    float m2n = mom2[paramIdx] =
        mom2[paramIdx] * beta2 + grad[gradIdx] * grad[gradIdx] * (1.0f - beta2);
    param[paramIdx] += lr[0] * correction * m1n / (sqrt(m2n) + epsilon);
  }
}

// As above, but also writes the bias-corrected effective gradient into
// output_grad.
template <typename SIndex>
__global__ void SparseAdamOutputGradKernel(
    const size_t N,
    const size_t grad_slice_sz,
    const float beta1,
    const float beta2,
    const float epsilon,
    float* param,
    float* mom1,
    float* mom2,
    float* output_grad,
    const SIndex* indices,
    const float* grad,
    const float correction,
    const float* lr,
    const float iter) {
  CUDA_1D_KERNEL_LOOP(i, N) {
    const size_t gradIdx = i;
    const SIndex index = indices[i / grad_slice_sz];
    const size_t paramIdx = index * grad_slice_sz + (i % grad_slice_sz);

    float m1n = mom1[paramIdx] =
        mom1[paramIdx] * beta1 + grad[gradIdx] * (1.0f - beta1);
    float m2n = mom2[paramIdx] =
        mom2[paramIdx] * beta2 + grad[gradIdx] * grad[gradIdx] * (1.0f - beta2);
    float gradOut = output_grad[gradIdx] =
        correction * m1n / (sqrt(m2n) + epsilon);
    param[paramIdx] += lr[0] * gradOut;
  }
}

template <>
template <typename SIndex>
bool SparseAdamOp<float, CUDAContext>::DoRunWithType() {
  Output(OUTPUT_PARAM)->ResizeLike(Input(PARAM));
  Output(OUTPUT_MOMENT_1)->ResizeLike(Input(MOMENT_1));
  Output(OUTPUT_MOMENT_2)->ResizeLike(Input(MOMENT_2));

  auto N = Input(GRAD).size();
  auto grad_slice_sz = Input(GRAD).size_from_dim(Input(INDICES).ndim());
  const auto iter =
      OperatorBase::Input(ITER, CPU).template data<int64_t>()[0];
  const float correction = sqrtf(1.0f - std::pow(beta2_, iter + 1)) /
      (1.0f - std::pow(beta1_, iter + 1));

  if (OutputSize() == 3) {
    SparseAdamKernel<SIndex>
        <<<CAFFE_GET_BLOCKS(N),
           CAFFE_CUDA_NUM_THREADS,
           0,
           context_.cuda_stream()>>>(
            N,
            grad_slice_sz,
            beta1_,
            beta2_,
            epsilon_,
            Output(OUTPUT_PARAM)->template mutable_data<float>(),
            Output(OUTPUT_MOMENT_1)->template mutable_data<float>(),
            Output(OUTPUT_MOMENT_2)->template mutable_data<float>(),
            Input(INDICES).template data<SIndex>(),
            Input(GRAD).template data<float>(),
            correction,
            Input(LR).template data<float>(),
            iter);
  } else {
    Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD));
    SparseAdamOutputGradKernel<SIndex>
        <<<CAFFE_GET_BLOCKS(N),
           CAFFE_CUDA_NUM_THREADS,
           0,
           context_.cuda_stream()>>>(
            N,
            grad_slice_sz,
            beta1_,
            beta2_,
            epsilon_,
            Output(OUTPUT_PARAM)->template mutable_data<float>(),
            Output(OUTPUT_MOMENT_1)->template mutable_data<float>(),
            Output(OUTPUT_MOMENT_2)->template mutable_data<float>(),
            Output(OUTPUT_GRAD)->template mutable_data<float>(),
            Input(INDICES).template data<SIndex>(),
            Input(GRAD).template data<float>(),
            correction,
            Input(LR).template data<float>(),
            iter);
  }
  return true;
}

REGISTER_CUDA_OPERATOR(Adam, AdamOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(SparseAdam, SparseAdamOp<float, CUDAContext>);

} // namespace caffe2