From a5b92ea0bc8bebd388caeb92cf1c10041c46b158 Mon Sep 17 00:00:00 2001 From: Tae-Young Chung Date: Thu, 20 May 2021 09:57:02 +0900 Subject: Add PoseDecoder and Landmark to decode various type of pose output tensor Change-Id: I8be806ff3522aec1f7026912b8c317055e9e16db Signed-off-by: Tae-Young Chung --- .../mv_inference/inference/include/Landmark.h | 53 +++ .../mv_inference/inference/include/PoseDecoder.h | 95 ++++ .../mv_inference/inference/src/PoseDecoder.cpp | 483 +++++++++++++++++++++ 3 files changed, 631 insertions(+) create mode 100644 mv_machine_learning/mv_inference/inference/include/Landmark.h create mode 100644 mv_machine_learning/mv_inference/inference/include/PoseDecoder.h create mode 100644 mv_machine_learning/mv_inference/inference/src/PoseDecoder.cpp diff --git a/mv_machine_learning/mv_inference/inference/include/Landmark.h b/mv_machine_learning/mv_inference/inference/include/Landmark.h new file mode 100644 index 00000000..63ccf60c --- /dev/null +++ b/mv_machine_learning/mv_inference/inference/include/Landmark.h @@ -0,0 +1,53 @@ +/** + * Copyright (c) 2021 Samsung Electronics Co., Ltd All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __MEDIA_VISION_LANDMARK_H__ +#define __MEDIA_VISION_LANDMARK_H__ + +#include +#include +#include +#include + +/** + * @file Landmark.h + * @brief This file contains the Landmark class definition which + * provides landmark information. + */ + +namespace mediavision +{ +namespace inference +{ + typedef struct _LandmarkPoint + { + float score; + cv::Point heatMapLoc; + cv::Point2f decodedLoc; + int id; + bool valid; + } LandmarkPoint; + + typedef struct _LandmarkResults + { + std::vector landmarks; + float score; + } LandmarkResults; + +} /* Inference */ +} /* MediaVision */ + +#endif /* __MEDIA_VISION_LANDMARK_H__ */ diff --git a/mv_machine_learning/mv_inference/inference/include/PoseDecoder.h b/mv_machine_learning/mv_inference/inference/include/PoseDecoder.h new file mode 100644 index 00000000..c910d620 --- /dev/null +++ b/mv_machine_learning/mv_inference/inference/include/PoseDecoder.h @@ -0,0 +1,95 @@ +/** + * Copyright (c) 2021 Samsung Electronics Co., Ltd All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __MEDIA_VISION_POSEDECODER_H__ +#define __MEDIA_VISION_POSEDECODER_H__ + +#include +#include +#include +#include + + +#include "TensorBuffer.h" +#include "OutputMetadata.h" +#include "Landmark.h" + +/** + * @file PoseDecoder.h + * @brief This file contains the PoseDecoder class definition which + * provides pose decoder. + */ + +namespace mediavision +{ +namespace inference +{ + class PoseDecoder + { + private: + TensorBuffer mTensorBuffer; + OutputMetadata mMeta; + int mHeatMapWidth; + int mHeatMapHeight; + int mHeatMapChannel; + int mNumberOfLandmarks; + + std::list mCandidates; + std::vector mPoseLandmarks; + + int getIndexToPos(LandmarkPoint& point, float scaleW, float scaleH); + int getPosToIndex(LandmarkPoint& landmark); + int getOffsetValue(LandmarkPoint& landmark, cv::Point2f &offsetVal); + int findPose(LandmarkPoint& root, std::vector& decodedLandmarks, + float scaleW, float scaleH); + int traverseToNeighbor(int edgeId, int toId, int dir, + LandmarkPoint fromLandmark, LandmarkPoint& toLandmark, + float scaleW, float scaleH); + int getEdgeVector(cv::Point index, int edgeId, int dir, cv::Point2f& vector); + + int convertXYZtoX(int x, int y, int c); + + cv::Point convertXYZtoXY(int x, int y, int c); + + public: + PoseDecoder(TensorBuffer& buffer, const OutputMetadata& metaData, + int heatMapWidth, int heatMapHeight, int heatMapChannel, + int numberOfLandmarks) : + mTensorBuffer(buffer), + mHeatMapWidth(heatMapWidth), + mHeatMapHeight(heatMapHeight), + mHeatMapChannel(heatMapChannel), + mNumberOfLandmarks(numberOfLandmarks) { + mMeta = metaData; + }; + + ~PoseDecoder() = default; + + int init(); + + int decode(float scaleWidth, float scaleHeight, float thresHoldRadius); + + int getNumberOfPose(); + + float getPointX(int poseIdx, int partIdx); + float getPointY(int poseIdx, int partIdx); + float getScore(int poseIdx, int partIdx); + }; + +} /* Inference */ +} /* MediaVision */ + +#endif /* __MEDIA_VISION_POSEDECODER_H__ */ diff --git a/mv_machine_learning/mv_inference/inference/src/PoseDecoder.cpp b/mv_machine_learning/mv_inference/inference/src/PoseDecoder.cpp new file mode 100644 index 00000000..f30fbf96 --- /dev/null +++ b/mv_machine_learning/mv_inference/inference/src/PoseDecoder.cpp @@ -0,0 +1,483 @@ +/** + * Copyright (c) 2021 Samsung Electronics Co., Ltd All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "mv_private.h" +#include "PoseDecoder.h" +#include "PostProcess.h" + +#include +#include +#include + +#define MAX_NUMBER_OF_POSE 5 +#define MAX_NUMBER_OF_CORRECTION 3 + +namespace mediavision +{ +namespace inference +{ + int PoseDecoder::convertXYZtoX(int x, int y, int c) + { + return y * mHeatMapWidth * mHeatMapChannel + + x * mHeatMapChannel + + c; + } + + cv::Point PoseDecoder::convertXYZtoXY(int x, int y, int c) + { + int idxY = y * mHeatMapWidth * mHeatMapChannel * 2 + + x * mHeatMapChannel * 2 + + c; + + int idxX = idxY + mHeatMapChannel; + + return cv::Point(idxX, idxY); + } + + int PoseDecoder::init() + { + LOGI("ENTER"); + + Landmark& landmarkInfo = mMeta.GetLandmark(); + + if (landmarkInfo.GetType() < 0 || landmarkInfo.GetType() >= 3) { + LOGE("Not supported landmark type"); + return MEDIA_VISION_ERROR_INVALID_OPERATION; + } + + if (landmarkInfo.GetDecodingType() == 0) { + LOGI("Skip init"); + return MEDIA_VISION_ERROR_NONE; + } + + int x,y,c; + int sx, sy, ex, ey, dx, dy; + float score, localScore; + int idx; + bool isLocalMax; + ScoreInfo& scoreInfo = mMeta.GetScore(); + + mCandidates.clear(); + + if (landmarkInfo.GetType() == 0 || + landmarkInfo.GetType() == 2) { + mCandidates.resize(mHeatMapChannel); + } + + for (y = 0; y < mHeatMapHeight; ++y) { + for (x = 0; x < mHeatMapWidth; ++x) { + std::list::iterator candidate = mCandidates.begin(); + for (c = 0; c < mHeatMapChannel; ++c, candidate++) { + isLocalMax = true; + idx = convertXYZtoX(x, y, c); + score = mTensorBuffer.getValue(scoreInfo.GetName(), idx); + if (scoreInfo.GetType() == 1) { + score = PostProcess::sigmoid(score); + } + + if (score < scoreInfo.GetThresHold()) + continue; + + if (landmarkInfo.GetType() == 0 || + landmarkInfo.GetType() == 2) { + if (score <= candidate->score) + continue; + + candidate->score = score; + candidate->heatMapLoc.x = x; + candidate->heatMapLoc.y = y; + candidate->id = c; + + } else { //landmarkInfo.type == 1 + sx = std::max(x - 1, 0); + sy = std::max(y - 1, 0); + ex = std::min(x + 2, mHeatMapWidth); + ey = std::min(y + 2, mHeatMapHeight); + + for (dy = sy; dy < ey; ++dy) { + for (dx = sx; dx < ex; ++dx) { + idx = convertXYZtoX(dx, dy, c); + localScore = mTensorBuffer.getValue(scoreInfo.GetName(), idx); + if (scoreInfo.GetType() == 1) { + localScore = PostProcess::sigmoid(localScore); + } + if (localScore > score) { + isLocalMax = false; + break; + } + } + if (isLocalMax == false) + break; + } + + if (isLocalMax == false) + continue; + + // add this to list + LOGI("[%d x %d][%d]: score %.3f", y, x, c, score); + std::list::iterator iter; + for (iter = mCandidates.begin(); iter != mCandidates.end(); ++iter) { + if ((*iter).score < score) { + break; + } + } + + LandmarkPoint localLandmark; + localLandmark.score = score; + localLandmark.heatMapLoc.x = x; + localLandmark.heatMapLoc.y = y; + localLandmark.id = c; + localLandmark.valid = false; + mCandidates.insert(iter, localLandmark); + } + } + } + } // end of init + + LOGI("LEAVE"); + + return MEDIA_VISION_ERROR_NONE; + } + + int PoseDecoder::getNumberOfPose() + { + return std::min(static_cast(mPoseLandmarks.size()), MAX_NUMBER_OF_POSE); + } + + int PoseDecoder::getOffsetValue(LandmarkPoint& landmark, cv::Point2f &offsetVal) + { + if (!mTensorBuffer.exist(mMeta.GetOffset().GetName())) { + offsetVal.x = offsetVal.y = 0.f; + LOGI("No offset value"); + LOGI("LEAVE"); + return MEDIA_VISION_ERROR_NONE; + } + + cv::Point idx = convertXYZtoXY(landmark.heatMapLoc.x, landmark.heatMapLoc.y, landmark.id); + + try { + offsetVal.x = mTensorBuffer.getValue(mMeta.GetOffset().GetName(), idx.x); + offsetVal.y = mTensorBuffer.getValue(mMeta.GetOffset().GetName(), idx.y); + } catch (const std::exception& e) { + LOGE("Fail to get value at (%d, %d) from %s", + idx.x, idx.y, mMeta.GetOffset().GetName().c_str()); + return MEDIA_VISION_ERROR_INVALID_OPERATION; + } + + return MEDIA_VISION_ERROR_NONE; + } + + float PoseDecoder::getPointX(int poseIdx, int partIdx) + { + LOGI("idx[%d]-part[%d]", poseIdx, partIdx); + return mPoseLandmarks[poseIdx].landmarks[partIdx].decodedLoc.x; + } + + float PoseDecoder::getPointY(int poseIdx, int partIdx) + { + LOGI("idx[%d]-part[%d]", poseIdx, partIdx); + return mPoseLandmarks[poseIdx].landmarks[partIdx].decodedLoc.y; + } + + float PoseDecoder::getScore(int poseIdx, int partIdx) + { + return mPoseLandmarks[poseIdx].landmarks[partIdx].score; + } + + int PoseDecoder::getIndexToPos(LandmarkPoint& point, float scaleW, float scaleH) + { + if (scaleW <= 0.0f || scaleH <= 0.0f) { + LOGE("scale width(%.4f) or height(%.4f) is less than or equal to zero", scaleW, scaleH); + return MEDIA_VISION_ERROR_INVALID_PARAMETER; + } + + cv::Point2f offsetVal; + getOffsetValue(point, offsetVal); + + point.decodedLoc.x = static_cast(point.heatMapLoc.x) / static_cast(mHeatMapWidth - 1); + point.decodedLoc.y = static_cast(point.heatMapLoc.y) / static_cast(mHeatMapHeight - 1); + + point.decodedLoc.x += offsetVal.x / scaleW; + point.decodedLoc.y += offsetVal.y / scaleH; + + return MEDIA_VISION_ERROR_NONE; + } + + int PoseDecoder::getPosToIndex(LandmarkPoint& point) + { + cv::Point posVal; + + posVal.x = roundf(point.decodedLoc.x * static_cast(mHeatMapWidth - 1)); + posVal.y = roundf(point.decodedLoc.y * static_cast(mHeatMapHeight - 1)); + + posVal.x = std::max(std::min(posVal.x, mHeatMapWidth - 1), 0); + posVal.y = std::max(std::min(posVal.y, mHeatMapHeight - 1), 0); + + point.heatMapLoc = posVal; + + return MEDIA_VISION_ERROR_NONE; + } + + int PoseDecoder::decode(float scaleWidth, float scaleHeight, float thresHoldRadius) + { + LOGI("ENTER"); + + if (scaleWidth <= 0.0f || scaleHeight <= 0.0f) { + LOGE("scale width(%.4f) or height(%.4f) is less than or equal to zero", scaleWidth, scaleHeight); + return MEDIA_VISION_ERROR_INVALID_PARAMETER; + } + + mPoseLandmarks.clear(); + LandmarkPoint initValue = {0.0f, cv::Point(0,0), cv::Point2f(0.0f, 0.0f), -1, false}; + + Landmark& landmarkInfo = mMeta.GetLandmark(); + ScoreInfo& scoreInfo = mMeta.GetScore(); + + if (landmarkInfo.GetType() == 0 || + landmarkInfo.GetType() == 2) { // single pose + mPoseLandmarks.resize(1); + + if (landmarkInfo.GetDecodingType() == 0) { // direct decoding + mPoseLandmarks[0].landmarks.resize(mNumberOfLandmarks); + } else { // heatmap decoding + mPoseLandmarks[0].landmarks.resize(mHeatMapChannel); + } + } + + if (landmarkInfo.GetDecodingType() != 0) { // heatmap decoding + while (!mCandidates.empty()) { + + LandmarkPoint &root = mCandidates.front(); + + getIndexToPos(root, scaleWidth, scaleHeight); + + if (landmarkInfo.GetType() == 0) { + root.valid = true; + mPoseLandmarks[0].landmarks[root.id] = root; + mPoseLandmarks[0].score += root.score; + mCandidates.pop_front(); + continue; + } + + LOGI("root id: %d", root.id); + + if (thresHoldRadius > 0.0f) { + bool isSkip = false; + for (auto& result : mPoseLandmarks) { + cv::Point2f dfRadius = result.landmarks[root.id].decodedLoc; + dfRadius -= root.decodedLoc; + float radius = + std::pow(dfRadius.x * scaleWidth, 2.0f) + + std::pow(dfRadius.y * scaleHeight, 2.0f); + LOGI("id[%d], radius: %.f vs. %.f", root.id, radius, std::pow(thresHoldRadius, 2.0f)); + if (radius <= std::pow(thresHoldRadius, 2.0f)) { + LOGI("Not local maximum, Skip this"); + isSkip = true; + break; + } + } + if (isSkip) { + mCandidates.pop_front(); + continue; + } + } + + LOGI("Local maximum. Add this"); + + std::vector decodedLandmarks(mHeatMapChannel, initValue); + + findPose(root, decodedLandmarks, scaleWidth, scaleHeight); + + float poseScore = 0.0f; + for (auto& landmark : decodedLandmarks) { + poseScore += landmark.score; + LOGI("%.3f, %.3f", landmark.decodedLoc.x, landmark.decodedLoc.y); + } + + mPoseLandmarks.push_back(LandmarkResults {decodedLandmarks, poseScore}); + if (mPoseLandmarks.size() > MAX_NUMBER_OF_POSE) + break; + mCandidates.pop_front(); + } + + for (auto& pose : mPoseLandmarks) { + pose.score /= static_cast(mHeatMapChannel); + } + } else { + // multi pose is not supported + std::vector scoreIndexes = scoreInfo.GetDimInfo().GetValidIndexAll(); + float poseScore = mTensorBuffer.getValue(scoreInfo.GetName(), scoreIndexes[scoreIndexes[0]]); + if (scoreInfo.GetType() == 1) { + poseScore = PostProcess::sigmoid(poseScore); + } + if (poseScore < scoreInfo.GetThresHold()) { + LOGI("pose score %.4f is lower than %.4f", poseScore, scoreInfo.GetThresHold()); + LOGI("LEAVE"); + return MEDIA_VISION_ERROR_NONE; + } + + int landmarkOffset = (landmarkInfo.GetType() == 0 || landmarkInfo.GetType() == 1) ? 2 : 3; + if (landmarkInfo.GetDecodingType() == 0) { + landmarkOffset = landmarkInfo.GetOffset(); + } + for (int idx = 0; idx < mNumberOfLandmarks; ++idx) { + float px = mTensorBuffer.getValue(landmarkInfo.GetName(), idx * landmarkOffset); + float py = mTensorBuffer.getValue(landmarkInfo.GetName(), idx * landmarkOffset + 1); + + mPoseLandmarks[0].landmarks[idx].score = poseScore; + mPoseLandmarks[0].landmarks[idx].heatMapLoc = cv::Point(-1, -1); + mPoseLandmarks[0].landmarks[idx].decodedLoc = cv::Point2f(px/scaleWidth, py/scaleHeight); + mPoseLandmarks[0].landmarks[idx].id = idx; + mPoseLandmarks[0].landmarks[idx].valid = true; + + LOGI("idx[%d]: %.4f, %.4f", idx, px, py); + } + + mPoseLandmarks[0].score = poseScore; + } + + LOGI("LEAVE"); + return MEDIA_VISION_ERROR_NONE; + } + + int PoseDecoder::findPose(LandmarkPoint& root, std::vector& decodedLandmarks, + float scaleW, float scaleH) + { + LOGI("ENTER"); + + if (scaleW <= 0.0f || scaleH <= 0.0f) { + LOGE("scale width(%.4f) or height(%.4f) is less than or equal to zero", scaleW, scaleH); + return MEDIA_VISION_ERROR_INVALID_PARAMETER; + } + + decodedLandmarks[root.id] = root; + decodedLandmarks[root.id].valid = true; + LOGI("KeyId: [%d], heatMap: %d, %d", root.id, root.heatMapLoc.x, root.heatMapLoc.y); + LOGI("KeyId: [%d], decoded: %.4f, %.4f, score %.3f", root.id, root.decodedLoc.x, root.decodedLoc.y, root.score); + + int index = static_cast(mMeta.GetEdge().GetEdgesAll().size()) - 1; + for (auto riter = mMeta.GetEdge().GetEdgesAll().rbegin(); + riter != mMeta.GetEdge().GetEdgesAll().rend(); ++riter) { + int fromKeyId = riter->second; + int toKeyId = riter->first; + + if (decodedLandmarks[fromKeyId].valid == true && + decodedLandmarks[toKeyId].valid == false) { + LOGI("BackTravers: from %d to %d", fromKeyId, toKeyId); + traverseToNeighbor(index, toKeyId, 1, + decodedLandmarks[fromKeyId], decodedLandmarks[toKeyId], + scaleW, scaleH); + LOGI("tgt_key_id[%d]: %.4f, %.4f, %.4f", toKeyId, + decodedLandmarks[toKeyId].decodedLoc.x, + decodedLandmarks[toKeyId].decodedLoc.y, + decodedLandmarks[toKeyId].score); + } + index--; + } + + index = 0; + for (auto iter = mMeta.GetEdge().GetEdgesAll().begin(); + iter != mMeta.GetEdge().GetEdgesAll().end(); ++iter) { + int fromKeyId = iter->first; + int toKeyId = iter->second; + + if (decodedLandmarks[fromKeyId].valid == true && + decodedLandmarks[toKeyId].valid == false) { + LOGI("FrwdTravers: form %d to %d", fromKeyId, toKeyId); + traverseToNeighbor(index, toKeyId, 0, + decodedLandmarks[fromKeyId], decodedLandmarks[toKeyId], + scaleW, scaleH); + } + index++; + } + LOGI("LEAVE"); + return MEDIA_VISION_ERROR_NONE; + } + + int PoseDecoder::traverseToNeighbor(int edgeId, int toId, int dir, + LandmarkPoint fromLandmark, LandmarkPoint& toLandmark, + float scaleW, float scaleH) + { + if (scaleW <= 0.0f || scaleH <= 0.0f) { + LOGE("scale width(%.4f) or height(%.4f) is less than or equal to zero", scaleW, scaleH); + return MEDIA_VISION_ERROR_INVALID_PARAMETER; + } + + cv::Point2f edgeVector(0.f, 0.f); + cv::Point nearHeatMapLoc; + + LOGI("org: %.4f, %.4f", fromLandmark.decodedLoc.x, fromLandmark.decodedLoc.y); + + // update heatMapLoc from decodedLoc; + nearHeatMapLoc.x = roundf(fromLandmark.decodedLoc.x + * static_cast(mHeatMapWidth - 1)); + nearHeatMapLoc.y = roundf(fromLandmark.decodedLoc.y + * static_cast(mHeatMapHeight - 1)); + + nearHeatMapLoc.x = std::max(std::min(nearHeatMapLoc.x, mHeatMapWidth - 1), 0); + nearHeatMapLoc.y = std::max(std::min(nearHeatMapLoc.y, mHeatMapHeight - 1), 0); + + LOGI("src: %d, %d", nearHeatMapLoc.x, nearHeatMapLoc.y); + + getEdgeVector(nearHeatMapLoc, edgeId, dir, edgeVector); + + LOGI("vector: %.4f, %.4f with edgeId %d", edgeVector.x, edgeVector.y, edgeId); + toLandmark.decodedLoc.x = fromLandmark.decodedLoc.x + edgeVector.x / scaleW; + toLandmark.decodedLoc.y = fromLandmark.decodedLoc.y + edgeVector.y / scaleH; + toLandmark.id = toId; + LOGI("tgt: %.4f, %.4f", toLandmark.decodedLoc.x, toLandmark.decodedLoc.y); + + for (int iter = 0; iter < MAX_NUMBER_OF_CORRECTION; ++iter) { + getPosToIndex(toLandmark); + getIndexToPos(toLandmark, scaleW, scaleH); + } + + int idx = convertXYZtoX(toLandmark.heatMapLoc.x, toLandmark.heatMapLoc.y, toLandmark.id); + toLandmark.score = mTensorBuffer.getValue(mMeta.GetScore().GetName(), idx); + if (mMeta.GetScore().GetType() == 1) { + toLandmark.score = PostProcess::sigmoid(toLandmark.score); + } + + toLandmark.valid = true; + LOGI("Final: %.4f, %.4f", toLandmark.decodedLoc.x, toLandmark.decodedLoc.y); + + return MEDIA_VISION_ERROR_NONE; + } + + int PoseDecoder::getEdgeVector(cv::Point index, int edgeId, int dir, cv::Point2f& vector) + { + LOGI("ENTER"); + + LOGI("edge size: %zd", mMeta.GetEdge().GetEdgesAll().size()); + int idxY = index.y * mHeatMapWidth + * static_cast(mMeta.GetEdge().GetEdgesAll().size()) * 2; + idxY += index.x * static_cast(mMeta.GetEdge().GetEdgesAll().size()) * 2 + edgeId; + + int idxX = idxY + static_cast(mMeta.GetEdge().GetEdgesAll().size()); + + for(auto& dispVec : mMeta.GetDispVecAll()){ + if (dispVec.GetType() == dir) { // 0: forward + LOGI("%s", dispVec.GetName().c_str()); + vector.x = mTensorBuffer.getValue(dispVec.GetName(), idxX); + vector.y = mTensorBuffer.getValue(dispVec.GetName(), idxY); + } + } + + LOGI("LEAVE"); + return MEDIA_VISION_ERROR_NONE; + } +} +} -- cgit v1.2.3