From 2cb1b69ca246be5bef11c90fb36af177d8323aaf Mon Sep 17 00:00:00 2001 From: Tae-Young Chung Date: Wed, 17 Aug 2022 09:37:32 +0900 Subject: Update Landmark to get multiple outputs In case of mediapipe's face with attention model, it outputs multiple landmarks such as facial landmark, eyes, lips, iris. To support that, Landmarks is changed to get multiple landmark outputs. Change-Id: Ia871a53b7e82af27b367ee6bde3f295e27e98729 Signed-off-by: Tae-Young Chung --- .../mv_inference/inference/include/Landmark.h | 123 ++++++++------------- .../inference/include/OutputMetadata.h | 21 ++-- .../mv_inference/inference/include/PoseDecoder.h | 6 +- .../mv_inference/inference/src/Inference.cpp | 100 +++++++++-------- .../mv_inference/inference/src/OutputMetadata.cpp | 76 +++++++------ .../mv_inference/inference/src/PoseDecoder.cpp | 52 +++++---- .../inference/mv_facestream_test_suite.cpp | 60 ++++++++-- 7 files changed, 232 insertions(+), 206 deletions(-) diff --git a/mv_machine_learning/mv_inference/inference/include/Landmark.h b/mv_machine_learning/mv_inference/inference/include/Landmark.h index c92e6ae1..f7a79292 100644 --- a/mv_machine_learning/mv_inference/inference/include/Landmark.h +++ b/mv_machine_learning/mv_inference/inference/include/Landmark.h @@ -108,41 +108,32 @@ namespace inference int ParseLandmark(JsonObject *root) { - // box - JsonArray * rootArray = json_object_get_array_member(root, "landmark"); - unsigned int elements = json_array_get_length(rootArray); + LOGI("ENTER"); - // TODO: handling error - for (unsigned int elem = 0; elem < elements; ++elem) { + name = + static_cast(json_object_get_string_member(root,"name")); + LOGI("layer: %s", name.c_str()); - JsonNode *pNode = json_array_get_element(rootArray, elem); - JsonObject *pObject = json_node_get_object(pNode); + JsonArray * array = json_object_get_array_member(root, "index"); + unsigned int elements2 = json_array_get_length(array); + LOGI("range dim: size[%u]", elements2); + for (unsigned int elem2 = 0; 
elem2 < elements2; ++elem2) { + if (static_cast(json_array_get_int_element(array, elem2)) == 1) + dimInfo.SetValidIndex(elem2); + } - name = - static_cast(json_object_get_string_member(pObject,"name")); - LOGI("layer: %s", name.c_str()); - - JsonArray * array = json_object_get_array_member(pObject, "index"); - unsigned int elements2 = json_array_get_length(array); - LOGI("range dim: size[%u]", elements2); - for (unsigned int elem2 = 0; elem2 < elements2; ++elem2) { - if (static_cast(json_array_get_int_element(array, elem2)) == 1) - dimInfo.SetValidIndex(elem2); - } - - try { - type = GetSupportedType(pObject, "landmark_type", supportedLandmarkTypes); - coordinate = GetSupportedType(pObject, "landmark_coordinate", supportedLandmarkCoordinateTypes); - decodingType = GetSupportedType(pObject, "decoding_type", supportedLandmarkDecodingTypes); - } catch (const std::exception& e) { - LOGE("Invalid %s", e.what()); - return MEDIA_VISION_ERROR_INVALID_OPERATION; - } - - offset = static_cast(json_object_get_int_member(pObject, "landmark_offset")); - LOGI("landmark offset: %d", offset); + try { + type = GetSupportedType(root, "landmark_type", supportedLandmarkTypes); + coordinate = GetSupportedType(root, "landmark_coordinate", supportedLandmarkCoordinateTypes); + decodingType = GetSupportedType(root, "decoding_type", supportedLandmarkDecodingTypes); + } catch (const std::exception& e) { + LOGE("Invalid %s", e.what()); + return MEDIA_VISION_ERROR_INVALID_OPERATION; } + offset = static_cast(json_object_get_int_member(root, "landmark_offset")); + LOGI("landmark offset: %d", offset); + LOGI("LEAVE"); return MEDIA_VISION_ERROR_NONE; } @@ -195,54 +186,36 @@ namespace inference { LOGI("ENTER"); - // box - JsonArray * rootArray = json_object_get_array_member(root, "landmark"); - unsigned int elements = json_array_get_length(rootArray); + JsonObject *cObject = json_object_get_object_member(root, "decoding_info"); + if (!json_object_has_member(cObject, "heatmap")) { + LOGE("heatmap is 
mandatory. Invalid metadata"); + LOGI("LEAVE"); - // TODO: handling error - for (unsigned int elem = 0; elem < elements; ++elem) { + return MEDIA_VISION_ERROR_INVALID_OPERATION; + } - JsonNode *pNode = json_array_get_element(rootArray, elem); - JsonObject *pObject = json_node_get_object(pNode); + JsonObject *object = json_object_get_object_member(cObject, "heatmap") ; + try { + GetHeatMapInfo().shapeType = GetSupportedType(object, "shape_type", supportedShapeType); + } catch (const std::exception& e) { + LOGE("Invalid %s", e.what()); + return MEDIA_VISION_ERROR_INVALID_OPERATION; + } + + std::vector heatMapIndexes = GetDimInfo().GetValidIndexAll(); + if (GetHeatMapInfo().shapeType == INFERENCE_TENSOR_SHAPE_NCHW) { + GetHeatMapInfo().cIdx = heatMapIndexes[0]; + GetHeatMapInfo().hIdx = heatMapIndexes[1]; + GetHeatMapInfo().wIdx = heatMapIndexes[2]; + } else { + GetHeatMapInfo().hIdx = heatMapIndexes[0]; + GetHeatMapInfo().wIdx = heatMapIndexes[1]; + GetHeatMapInfo().cIdx = heatMapIndexes[2]; + } - if (!json_object_has_member(pObject, "decoding_info")) { - LOGE("decoding_info is mandatory. Invalid metadata"); - LOGI("LEAVE"); - - return MEDIA_VISION_ERROR_INVALID_OPERATION; - } - - JsonObject *cObject = json_object_get_object_member(pObject, "decoding_info"); - if (!json_object_has_member(cObject, "heatmap")) { - LOGE("heatmap is mandatory. 
Invalid metadata"); - LOGI("LEAVE"); - - return MEDIA_VISION_ERROR_INVALID_OPERATION; - } - - JsonObject *object = json_object_get_object_member(cObject, "heatmap") ; - try { - GetHeatMapInfo().shapeType = GetSupportedType(object, "shape_type", supportedShapeType); - } catch (const std::exception& e) { - LOGE("Invalid %s", e.what()); - return MEDIA_VISION_ERROR_INVALID_OPERATION; - } - - std::vector heatMapIndexes = GetDimInfo().GetValidIndexAll(); - if (GetHeatMapInfo().shapeType == INFERENCE_TENSOR_SHAPE_NCHW) { - GetHeatMapInfo().cIdx = heatMapIndexes[0]; - GetHeatMapInfo().hIdx = heatMapIndexes[1]; - GetHeatMapInfo().wIdx = heatMapIndexes[2]; - } else { - GetHeatMapInfo().hIdx = heatMapIndexes[0]; - GetHeatMapInfo().wIdx = heatMapIndexes[1]; - GetHeatMapInfo().cIdx = heatMapIndexes[2]; - } - - if (json_object_has_member(object, "nms_radius")) { - GetHeatMapInfo().nmsRadius = static_cast(json_object_get_double_member(object, "nms_radius")); - LOGI("nms is enabled with %3.f", GetHeatMapInfo().nmsRadius ); - } + if (json_object_has_member(object, "nms_radius")) { + GetHeatMapInfo().nmsRadius = static_cast(json_object_get_double_member(object, "nms_radius")); + LOGI("nms is enabled with %3.f", GetHeatMapInfo().nmsRadius ); } LOGI("LEAVE"); diff --git a/mv_machine_learning/mv_inference/inference/include/OutputMetadata.h b/mv_machine_learning/mv_inference/inference/include/OutputMetadata.h index 9385aa7b..402b7e21 100644 --- a/mv_machine_learning/mv_inference/inference/include/OutputMetadata.h +++ b/mv_machine_learning/mv_inference/inference/include/OutputMetadata.h @@ -52,7 +52,7 @@ namespace inference bool parsed; ScoreInfo score; box::BoxInfo box; - Landmark landmark; + std::vector landmarks; OffsetVec offsetVec; std::map mSupportedShapeType; @@ -103,15 +103,16 @@ namespace inference DimInfo GetBoxNumberDimInfo() { return box.GetNumberDimInfo(); } int GetScoreCoordinate() { return box.GetCoordinate(); } - std::string GetLandmarkName() { return landmark.GetName(); 
} - int GetLandmarkOffset() { return landmark.GetOffset(); } - inference_landmark_type_e GetLandmarkType() { return landmark.GetType(); } - DimInfo GetLandmarkDimInfo() { return landmark.GetDimInfo(); } - HeatMapInfo& GetLandmarkHeatMapInfo() { return landmark.GetHeatMapInfo(); } - inference_landmark_coorindate_type_e GetLandmarkCoordinate() { return landmark.GetCoordinate(); } - inference_landmark_decoding_type_e GetLandmarkDecodingType() { return landmark.GetDecodingType(); } - std::vector& GetLandmarkDispVecAll() { return landmark.GetDispVecAll(); } - std::vector>& GetLandmarkEdges() { return landmark.GetEdges(); } + size_t GetLandmarksSize() { return landmarks.size(); } + std::string GetLandmarkName(int idx = 0) { return landmarks[idx].GetName(); } + int GetLandmarkOffset(int idx = 0) { return landmarks[idx].GetOffset(); } + inference_landmark_type_e GetLandmarkType(int idx = 0) { return landmarks[idx].GetType(); } + DimInfo GetLandmarkDimInfo(int idx = 0) { return landmarks[idx].GetDimInfo(); } + HeatMapInfo& GetLandmarkHeatMapInfo(int idx = 0) { return landmarks[idx].GetHeatMapInfo(); } + inference_landmark_coorindate_type_e GetLandmarkCoordinate(int idx = 0) { return landmarks[idx].GetCoordinate(); } + inference_landmark_decoding_type_e GetLandmarkDecodingType(int idx = 0) { return landmarks[idx].GetDecodingType(); } + std::vector& GetLandmarkDispVecAll(int idx = 0) { return landmarks[idx].GetDispVecAll(); } + std::vector>& GetLandmarkEdges(int idx = 0) { return landmarks[idx].GetEdges(); } std::string GetOffsetVecName() { return offsetVec.GetName(); } inference_box_decoding_type_e GetBoxDecodingType() { return box.GetDecodingType(); } }; diff --git a/mv_machine_learning/mv_inference/inference/include/PoseDecoder.h b/mv_machine_learning/mv_inference/inference/include/PoseDecoder.h index aaeb48ea..c1ea4932 100644 --- a/mv_machine_learning/mv_inference/inference/include/PoseDecoder.h +++ b/mv_machine_learning/mv_inference/inference/include/PoseDecoder.h @@ 
-45,6 +45,7 @@ namespace inference int mHeatMapHeight; int mHeatMapChannel; int mNumberOfLandmarks; + int mIdx; std::list mCandidates; std::vector mPoseLandmarks; @@ -66,12 +67,13 @@ namespace inference public: PoseDecoder(TensorBuffer& buffer, const OutputMetadata& metaData, int heatMapWidth, int heatMapHeight, int heatMapChannel, - int numberOfLandmarks) : + int numberOfLandmarks, int idx = 0) : mTensorBuffer(buffer), mHeatMapWidth(heatMapWidth), mHeatMapHeight(heatMapHeight), mHeatMapChannel(heatMapChannel), - mNumberOfLandmarks(numberOfLandmarks) { + mNumberOfLandmarks(numberOfLandmarks), + mIdx(idx) { mMeta = metaData; }; diff --git a/mv_machine_learning/mv_inference/inference/src/Inference.cpp b/mv_machine_learning/mv_inference/inference/src/Inference.cpp index 2a43efdd..31d2a9c0 100755 --- a/mv_machine_learning/mv_inference/inference/src/Inference.cpp +++ b/mv_machine_learning/mv_inference/inference/src/Inference.cpp @@ -463,15 +463,14 @@ namespace inference if (!outputMeta.GetBoxNumberName().empty()) mConfig.mOutputLayerNames.push_back(outputMeta.GetBoxNumberName()); - if (!outputMeta.GetLandmarkName().empty()) - mConfig.mOutputLayerNames.push_back(outputMeta.GetLandmarkName()); + for (int idx = 0; idx < outputMeta.GetLandmarksSize(); idx++) { + mConfig.mOutputLayerNames.push_back(outputMeta.GetLandmarkName(idx)); + for (auto& dispVec : outputMeta.GetLandmarkDispVecAll(idx)) + mConfig.mOutputLayerNames.push_back(dispVec.GetName()); + } if (!outputMeta.GetOffsetVecName().empty()) mConfig.mOutputLayerNames.push_back(outputMeta.GetOffsetVecName()); - - for (auto& dispVec : outputMeta.GetLandmarkDispVecAll()) { - mConfig.mOutputLayerNames.push_back(dispVec.GetName()); - } } inference_engine_layer_property property; @@ -1532,56 +1531,59 @@ namespace inference return MEDIA_VISION_ERROR_INVALID_OPERATION; } - int heatMapWidth = 0; - int heatMapHeight = 0; - int heatMapChannel = 0; - std::vector channelIndexes = outputMeta.GetLandmarkDimInfo().GetValidIndexAll(); - 
int number_of_landmarks = heatMapChannel; - - if (outputMeta.GetLandmarkDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS) { - LOGI("landmark dim size: %zd and idx[0] is %d", channelIndexes.size(), channelIndexes[0]); - number_of_landmarks = mOutputLayerProperty.layers[outputMeta.GetLandmarkName()].shape[channelIndexes[0]] - / outputMeta.GetLandmarkOffset(); - } else if (outputMeta.GetLandmarkDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS_MULTICHANNEL) { - number_of_landmarks = mOutputLayerProperty.layers[outputMeta.GetLandmarkName()].shape[channelIndexes[0]]; - } else { - heatMapWidth = mOutputLayerProperty.layers[outputMeta.GetLandmarkName()].shape[outputMeta.GetLandmarkHeatMapInfo().wIdx]; - heatMapHeight = mOutputLayerProperty.layers[outputMeta.GetLandmarkName()].shape[outputMeta.GetLandmarkHeatMapInfo().hIdx]; - heatMapChannel = mOutputLayerProperty.layers[outputMeta.GetLandmarkName()].shape[outputMeta.GetLandmarkHeatMapInfo().cIdx]; - } + for (int idx = 0; idx < outputMeta.GetLandmarksSize(); idx++) { + LOGE("decode: %s", outputMeta.GetLandmarkName(idx).c_str()); + int heatMapWidth = 0; + int heatMapHeight = 0; + int heatMapChannel = 0; + std::vector channelIndexes = outputMeta.GetLandmarkDimInfo(idx).GetValidIndexAll(); + int number_of_landmarks = heatMapChannel; + + if (outputMeta.GetLandmarkDecodingType(idx) == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS) { + LOGI("landmark dim size: %zd and idx[0] is %d", channelIndexes.size(), channelIndexes[0]); + number_of_landmarks = mOutputLayerProperty.layers[outputMeta.GetLandmarkName(idx)].shape[channelIndexes[0]] + / outputMeta.GetLandmarkOffset(idx); + } else if (outputMeta.GetLandmarkDecodingType(idx) == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS_MULTICHANNEL) { + number_of_landmarks = mOutputLayerProperty.layers[outputMeta.GetLandmarkName(idx)].shape[channelIndexes[0]]; + } else { + heatMapWidth = 
mOutputLayerProperty.layers[outputMeta.GetLandmarkName(idx)].shape[outputMeta.GetLandmarkHeatMapInfo(idx).wIdx]; + heatMapHeight = mOutputLayerProperty.layers[outputMeta.GetLandmarkName(idx)].shape[outputMeta.GetLandmarkHeatMapInfo(idx).hIdx]; + heatMapChannel = mOutputLayerProperty.layers[outputMeta.GetLandmarkName(idx)].shape[outputMeta.GetLandmarkHeatMapInfo(idx).cIdx]; + } - LOGI("heatMap: w[%d], h[%d], c[%d]", heatMapWidth, heatMapHeight, heatMapChannel); + LOGI("heatMap: w[%d], h[%d], c[%d]", heatMapWidth, heatMapHeight, heatMapChannel); - // decoding - PoseDecoder poseDecoder(mOutputTensorBuffers, outputMeta, - heatMapWidth, heatMapHeight, heatMapChannel, - number_of_landmarks); + // decoding + PoseDecoder poseDecoder(mOutputTensorBuffers, outputMeta, + heatMapWidth, heatMapHeight, heatMapChannel, + number_of_landmarks, idx); - // initialize decorder queue with landmarks to be decoded. - int ret = poseDecoder.init(); - if (ret != MEDIA_VISION_ERROR_NONE) { - LOGE("Fail to init poseDecoder"); - return ret; - } + // initialize decorder queue with landmarks to be decoded. + int ret = poseDecoder.init(); + if (ret != MEDIA_VISION_ERROR_NONE) { + LOGE("Fail to init poseDecoder"); + return ret; + } - float inputW = 1.f; - float inputH = 1.f; + float inputW = 1.f; + float inputH = 1.f; - if (outputMeta.GetLandmarkCoordinate() == INFERENCE_LANDMARK_COORDINATE_TYPE_PIXEL) { - inputW = static_cast(mMetadata.GetInputMeta().GetLayer().begin()->second.GetWidth()); - inputH = static_cast(mMetadata.GetInputMeta().GetLayer().begin()->second.GetHeight()); - } + if (outputMeta.GetLandmarkCoordinate(idx) == INFERENCE_LANDMARK_COORDINATE_TYPE_PIXEL) { + inputW = static_cast(mMetadata.GetInputMeta().GetLayer().begin()->second.GetWidth()); + inputH = static_cast(mMetadata.GetInputMeta().GetLayer().begin()->second.GetHeight()); + } - float thresRadius = outputMeta.GetLandmarkType() == INFERENCE_LANDMARK_TYPE_2D_SINGLE ? 
0.0 : - outputMeta.GetLandmarkHeatMapInfo().nmsRadius; + float thresRadius = outputMeta.GetLandmarkType(idx) == INFERENCE_LANDMARK_TYPE_2D_SINGLE ? 0.0 : + outputMeta.GetLandmarkHeatMapInfo(idx).nmsRadius; - poseDecoder.decode(inputW, inputH, thresRadius); - LOGE("imgW:%d, imgH:%d", mSourceSize.width, mSourceSize.height); - for (int landmarkIndex = 0; landmarkIndex < number_of_landmarks; landmarkIndex++) { - results->locations.push_back( - cv::Point3f(poseDecoder.getPointX(0, landmarkIndex) * static_cast(mSourceSize.width), - poseDecoder.getPointY(0, landmarkIndex) * static_cast(mSourceSize.height), - poseDecoder.getPointZ(0, landmarkIndex))); + poseDecoder.decode(inputW, inputH, thresRadius); + LOGE("imgW:%d, imgH:%d", mSourceSize.width, mSourceSize.height); + for (int landmarkIndex = 0; landmarkIndex < number_of_landmarks; landmarkIndex++) { + results->locations.push_back( + cv::Point3f(poseDecoder.getPointX(0, landmarkIndex) * static_cast(mSourceSize.width), + poseDecoder.getPointY(0, landmarkIndex) * static_cast(mSourceSize.height), + poseDecoder.getPointZ(0, landmarkIndex))); + } } results->number_of_landmarks = results->locations.size(); diff --git a/mv_machine_learning/mv_inference/inference/src/OutputMetadata.cpp b/mv_machine_learning/mv_inference/inference/src/OutputMetadata.cpp index 391b265c..dd0c1219 100755 --- a/mv_machine_learning/mv_inference/inference/src/OutputMetadata.cpp +++ b/mv_machine_learning/mv_inference/inference/src/OutputMetadata.cpp @@ -35,7 +35,7 @@ namespace inference parsed(false), score(), box(), - landmark(), + landmarks(), offsetVec() { // shape_type @@ -389,10 +389,49 @@ namespace inference return MEDIA_VISION_ERROR_NONE; } - landmark.ParseLandmark(root); + int ret = MEDIA_VISION_ERROR_NONE; + JsonArray * rootArray = json_object_get_array_member(root, "landmark"); + unsigned int elements = json_array_get_length(rootArray); + for (unsigned int elem = 0; elem < elements; ++elem) { + JsonNode *pNode = 
json_array_get_element(rootArray, elem); + JsonObject *pObject = json_node_get_object(pNode); + Landmark lmark; + lmark.ParseLandmark(pObject); + + if (lmark.GetDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_HEATMAP || + lmark.GetDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_HEATMAP_REFINE) { + ret = lmark.ParseDecodeInfo(pObject, mSupportedShapeType); + if (ret != MEDIA_VISION_ERROR_NONE) { + LOGE("Fail to GetLandmarkDecodeInfo[%d]", ret); + return ret; + } + } + + if (lmark.GetDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_HEATMAP_REFINE) { + ret = ParseOffset(root); + if (ret != MEDIA_VISION_ERROR_NONE) { + LOGE("Fail to GetOffsetVector[%d]", ret); + return ret; + } + + ret = lmark.ParseDisplacement(root, mSupportedShapeType); + if (ret != MEDIA_VISION_ERROR_NONE) { + LOGE("Fail to GetDispVector[%d]", ret); + return ret; + } + + ret = lmark.ParseEdgeMap(root); + if (ret != MEDIA_VISION_ERROR_NONE) { + LOGE("Fail to GetEdgeConnection[%d]", ret); + return ret; + } + } + + landmarks.push_back(lmark); + } LOGI("LEAVE"); - return MEDIA_VISION_ERROR_NONE; + return ret; } int OutputMetadata::ParseOffset(JsonObject *root) @@ -467,37 +506,6 @@ namespace inference return ret; } - if (!landmark.GetName().empty()) { - if (landmark.GetDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_HEATMAP || - landmark.GetDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_HEATMAP_REFINE) { - ret = landmark.ParseDecodeInfo(root, mSupportedShapeType); - if (ret != MEDIA_VISION_ERROR_NONE) { - LOGE("Fail to GetLandmarkDecodeInfo[%d]", ret); - return ret; - } - } - - if (landmark.GetDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_HEATMAP_REFINE) { - ret = ParseOffset(root); - if (ret != MEDIA_VISION_ERROR_NONE) { - LOGE("Fail to GetOffsetVector[%d]", ret); - return ret; - } - - ret = landmark.ParseDisplacement(root, mSupportedShapeType); - if (ret != MEDIA_VISION_ERROR_NONE) { - LOGE("Fail to GetDispVector[%d]", ret); - return ret; - } - - ret = landmark.ParseEdgeMap(root); - 
if (ret != MEDIA_VISION_ERROR_NONE) { - LOGE("Fail to GetEdgeConnection[%d]", ret); - return ret; - } - } - } - parsed = true; LOGI("LEAVE"); diff --git a/mv_machine_learning/mv_inference/inference/src/PoseDecoder.cpp b/mv_machine_learning/mv_inference/inference/src/PoseDecoder.cpp index 0cadd8a7..bc124d06 100644 --- a/mv_machine_learning/mv_inference/inference/src/PoseDecoder.cpp +++ b/mv_machine_learning/mv_inference/inference/src/PoseDecoder.cpp @@ -50,8 +50,8 @@ namespace inference { LOGI("ENTER"); - if (mMeta.GetLandmarkDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS || - mMeta.GetLandmarkDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS_MULTICHANNEL) { + if (mMeta.GetLandmarkDecodingType(mIdx) == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS || + mMeta.GetLandmarkDecodingType(mIdx) == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS_MULTICHANNEL) { LOGI("Skip init"); return MEDIA_VISION_ERROR_NONE; } @@ -64,8 +64,8 @@ namespace inference mCandidates.clear(); - if (mMeta.GetLandmarkType() == INFERENCE_LANDMARK_TYPE_2D_SINGLE || - mMeta.GetLandmarkType() == INFERENCE_LANDMARK_TYPE_3D_SINGLE) { + if (mMeta.GetLandmarkType(mIdx) == INFERENCE_LANDMARK_TYPE_2D_SINGLE || + mMeta.GetLandmarkType(mIdx) == INFERENCE_LANDMARK_TYPE_3D_SINGLE) { mCandidates.resize(mHeatMapChannel); } @@ -83,8 +83,8 @@ namespace inference if (score < mMeta.GetScoreThreshold()) continue; - if (mMeta.GetLandmarkType() == INFERENCE_LANDMARK_TYPE_2D_SINGLE || - mMeta.GetLandmarkType() == INFERENCE_LANDMARK_TYPE_3D_SINGLE) { + if (mMeta.GetLandmarkType(mIdx) == INFERENCE_LANDMARK_TYPE_2D_SINGLE || + mMeta.GetLandmarkType(mIdx) == INFERENCE_LANDMARK_TYPE_3D_SINGLE) { if (score <= candidate->score) continue; @@ -119,7 +119,7 @@ namespace inference continue; // add this to list - LOGI("[%d x %d][%d]: score %.3f", y, x, c, score); + //LOGI("[%d x %d][%d]: score %.3f", y, x, c, score); std::list::iterator iter; for (iter = mCandidates.begin(); iter != mCandidates.end(); ++iter) { if 
((*iter).score < score) { @@ -239,27 +239,27 @@ namespace inference LandmarkPoint initValue = {0.0f, cv::Point(0,0), cv::Point3f(0.0f, 0.0f, 0.0f), -1, false}; - if (mMeta.GetLandmarkType() == INFERENCE_LANDMARK_TYPE_2D_SINGLE || - mMeta.GetLandmarkType() == INFERENCE_LANDMARK_TYPE_3D_SINGLE) { + if (mMeta.GetLandmarkType(mIdx) == INFERENCE_LANDMARK_TYPE_2D_SINGLE || + mMeta.GetLandmarkType(mIdx) == INFERENCE_LANDMARK_TYPE_3D_SINGLE) { mPoseLandmarks.resize(1); - if (mMeta.GetLandmarkDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS || - mMeta.GetLandmarkDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS_MULTICHANNEL) { + if (mMeta.GetLandmarkDecodingType(mIdx) == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS || + mMeta.GetLandmarkDecodingType(mIdx) == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS_MULTICHANNEL) { mPoseLandmarks[0].landmarks.resize(mNumberOfLandmarks); } else { mPoseLandmarks[0].landmarks.resize(mHeatMapChannel); } } - if (mMeta.GetLandmarkDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_HEATMAP || - mMeta.GetLandmarkDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_HEATMAP_REFINE) { + if (mMeta.GetLandmarkDecodingType(mIdx) == INFERENCE_LANDMARK_DECODING_TYPE_HEATMAP || + mMeta.GetLandmarkDecodingType(mIdx) == INFERENCE_LANDMARK_DECODING_TYPE_HEATMAP_REFINE) { while (!mCandidates.empty()) { LandmarkPoint &root = mCandidates.front(); getIndexToPos(root, scaleWidth, scaleHeight); - if (mMeta.GetLandmarkType() == INFERENCE_LANDMARK_TYPE_2D_SINGLE) { + if (mMeta.GetLandmarkType(mIdx) == INFERENCE_LANDMARK_TYPE_2D_SINGLE) { root.valid = true; mPoseLandmarks[0].landmarks[root.id] = root; mPoseLandmarks[0].score += root.score; @@ -311,11 +311,11 @@ namespace inference for (auto& pose : mPoseLandmarks) { pose.score /= static_cast(mHeatMapChannel); } - } else if (mMeta.GetLandmarkDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS_MULTICHANNEL) { - int landmarkOffset = mMeta.GetLandmarkOffset(); + } else if (mMeta.GetLandmarkDecodingType(mIdx) 
== INFERENCE_LANDMARK_DECODING_TYPE_BYPASS_MULTICHANNEL) { + int landmarkOffset = mMeta.GetLandmarkOffset(mIdx); for (int idx = 0; idx < mNumberOfLandmarks; ++idx) { - float py = mTensorBuffer.getValue(mMeta.GetLandmarkName(), idx * landmarkOffset); - float px = mTensorBuffer.getValue(mMeta.GetLandmarkName(), idx * landmarkOffset + 1); + float py = mTensorBuffer.getValue(mMeta.GetLandmarkName(mIdx), idx * landmarkOffset); + float px = mTensorBuffer.getValue(mMeta.GetLandmarkName(mIdx), idx * landmarkOffset + 1); float pscore = mTensorBuffer.getValue(mMeta.GetScoreName(), idx * landmarkOffset + 2); mPoseLandmarks[0].landmarks[idx].score = pscore; @@ -351,19 +351,23 @@ namespace inference } } - int landmarkOffset = (mMeta.GetLandmarkType() == INFERENCE_LANDMARK_TYPE_2D_SINGLE || - mMeta.GetLandmarkType() == INFERENCE_LANDMARK_TYPE_2D_MULTI) ? 2 : 3; + int landmarkOffset = (mMeta.GetLandmarkType(mIdx) == INFERENCE_LANDMARK_TYPE_2D_SINGLE || + mMeta.GetLandmarkType(mIdx) == INFERENCE_LANDMARK_TYPE_2D_MULTI) ? 2 : 3; - if (mMeta.GetLandmarkDecodingType() == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS) { + if (mMeta.GetLandmarkDecodingType(mIdx) == INFERENCE_LANDMARK_DECODING_TYPE_BYPASS) { - landmarkOffset = mMeta.GetLandmarkOffset(); + landmarkOffset = mMeta.GetLandmarkOffset(mIdx); } LOGE("landmark count : %d", mNumberOfLandmarks); LOGE("landmark offset: %d", landmarkOffset); LOGE("scale width x height: %.3fx%.3f", scaleWidth, scaleHeight); for (int idx = 0; idx < mNumberOfLandmarks; ++idx) { - float px = mTensorBuffer.getValue(mMeta.GetLandmarkName(), idx * landmarkOffset); - float py = landmarkOffset >= 2 ? mTensorBuffer.getValue(mMeta.GetLandmarkName(), idx * landmarkOffset + 1) : 0.0f; - float pz = landmarkOffset >= 3 ? mTensorBuffer.getValue(mMeta.GetLandmarkName(), idx * landmarkOffset + 2) : 0.0f; + float px = mTensorBuffer.getValue(mMeta.GetLandmarkName(mIdx), idx * landmarkOffset); + float py = landmarkOffset >= 2 ? + mTensorBuffer.getValue(mMeta.GetLandmarkName(mIdx), idx * landmarkOffset + 1) : + 0.0f; + float pz = landmarkOffset >= 3 ? 
+ mTensorBuffer.getValue(mMeta.GetLandmarkName(mIdx), idx * landmarkOffset + 2) : + 0.0f; - mPoseLandmarks[0].landmarks[idx].score = landmarkOffset < 5 ? poseScore : mTensorBuffer.getValue(mMeta.GetLandmarkName(), idx * landmarkOffset + 4); + mPoseLandmarks[0].landmarks[idx].score = landmarkOffset < 5 ? poseScore : mTensorBuffer.getValue(mMeta.GetLandmarkName(mIdx), idx * landmarkOffset + 4); mPoseLandmarks[0].landmarks[idx].heatMapLoc = cv::Point(-1, -1); diff --git a/test/testsuites/machine_learning/inference/mv_facestream_test_suite.cpp b/test/testsuites/machine_learning/inference/mv_facestream_test_suite.cpp index f8086a0c..51394cf5 100644 --- a/test/testsuites/machine_learning/inference/mv_facestream_test_suite.cpp +++ b/test/testsuites/machine_learning/inference/mv_facestream_test_suite.cpp @@ -41,8 +41,10 @@ #define MAX_STRING_LENGTH 1024 #define ARRAY_SIZE(x) (sizeof((x)) / sizeof((x)[0])) #define MAX_FRAMES 1800 // 30 fps * 60s -#define FLD_MODEL_PATH "/usr/share/capi-media-vision/models/FLD/tflite/fld_mediapipe_192x192.tflite" -#define FLD_META_PATH "/usr/share/capi-media-vision/models/FLD/tflite/fld_mediapipe_192x192.json" +//#define FLD_MODEL_PATH "/usr/share/capi-media-vision/models/FLD/tflite/fld_mediapipe_192x192.tflite" +//#define FLD_META_PATH "/usr/share/capi-media-vision/models/FLD/tflite/fld_mediapipe_192x192.json" +#define FLD_MODEL_PATH "/usr/share/capi-media-vision/models/FLD/tflite/face_landmark_with_attention.tflite" +#define FLD_META_PATH "/usr/share/capi-media-vision/models/FLD/tflite/face_landmark_with_attention.json" #define FD_MODEL_PATH "/usr/share/capi-media-vision/models/FD/tflite/fd_blazeface_front_128x128.tflite" #define FD_META_PATH "/usr/share/capi-media-vision/models/FD/tflite/fd_blazeface_front_128x128.json" #define FD_LABEL_PATH "/usr/share/capi-media-vision/models/FD/tflite/fd_blazeface_front_label.txt" @@ -98,6 +100,7 @@ typedef struct _appdata { int flandmark_num; int numFrame; CairoOverlayState *overlay_state; + int input; } Appdata; @@ -122,7 +125,7 @@ GstElement *pipeline; // Gstreamer - camera src GstElement *facecam, *source, *filter, *vconv, *tee; -GstElement *sdec, 
*sscale; +GstElement *sdec, *sscale, *srate; GstElement *queue1, *queue2, *queue3; GstElement *vscale, *vsfilter; @@ -198,7 +201,7 @@ static gboolean bus_call (GstBus *bus, GstMessage *msg, gpointer data) static void cairo_overlay_handler(GstElement *overlay, cairo_t *cr, guint64 timestamp, guint64 duration, gpointer user_data) { - printf("cairo_overlay_handler\n"); + //printf("cairo_overlay_handler\n"); Appdata *appdata = static_cast(user_data); CairoOverlayState *s = (CairoOverlayState *)appdata->overlay_state; if (!s->valid) @@ -212,12 +215,10 @@ static void cairo_overlay_handler(GstElement *overlay, cairo_t *cr, guint64 time cairo_arc(cr, faceSkeleton.fLmark[pt].x, faceSkeleton.fLmark[pt].y, LD_RADIUS, LD_START_ANGLE, LD_END_ANGLE); cairo_fill(cr); } - /* cairo_rectangle(cr, faceSkeleton.fRoi.point.x, faceSkeleton.fRoi.point.y, faceSkeleton.fRoi.width, faceSkeleton.fRoi.height); - */ } cairo_stroke(cr); @@ -251,16 +252,32 @@ static void _facial_landmark_cb(mv_source_h source, float smoothingCoeff = 0.2f; float maxAlpha = 0.8f; + unsigned int width, height, bufferSize; + unsigned char *buffer = nullptr; + mv_source_get_width(source, &width); + mv_source_get_height(source, &height); + mv_source_get_buffer(source, &buffer, &bufferSize); + + cv::Mat result(cv::Size(width, height), CV_8UC3, buffer); for (int pt=0; pt < landmarks; pt++) { x = static_cast(locations[pt].x) / 192.f * static_cast(faceSkeleton.fRoi.width); y = static_cast(locations[pt].y) / 192.f * static_cast(faceSkeleton.fRoi.height); faceSkeleton.fLmark[pt].x = static_cast(x) + faceSkeleton.fRoi.point.x; faceSkeleton.fLmark[pt].y = static_cast(y) + faceSkeleton.fRoi.point.y; faceSkeleton.fLmark[pt].z = locations[pt].z; + + // 0 ~ 79: lips + // 80 ~ 150: left eye + // 151 ~ 221: right eye + cv::circle(result, cv::Point(locations[pt].x, locations[pt].y), 1, pt < 222 ? 
cv::Scalar(0,255,0) : cv::Scalar(255,0,0)); + + /* printf("%d: x[%d], y[%d], z[%f]\n", pt, faceSkeleton.fLmark[pt].x, faceSkeleton.fLmark[pt].y, faceSkeleton.fLmark[pt].z); + */ } + cv::imwrite("/tmp/result.png", result); } static gboolean @@ -375,9 +392,14 @@ static void fd_handoff(GstElement *object, GstBuffer *buffer, GstPad *pad, gpoin } int createPipelineCam(Appdata& appdata) { - source = gst_element_factory_make("v4l2src", "src"); + if (appdata.input == 0) { + source = gst_element_factory_make("v4l2src", "src"); + } else { + source = gst_element_factory_make("multifilesrc", "src"); + } sdec = gst_element_factory_make("jpegdec", "sdec"); sscale = gst_element_factory_make("videoscale", "sscale"); + srate = gst_element_factory_make("videorate", "srate"); filter = gst_element_factory_make("capsfilter", "filter"); @@ -391,7 +413,7 @@ int createPipelineCam(Appdata& appdata) vsfilter = gst_element_factory_make("capsfilter", "vsfilter"); vconv = gst_element_factory_make("videoconvert", "convert"); vcfilter = gst_element_factory_make("capsfilter", "vcfilter"); - vrate = gst_element_factory_make("videorate", "rate"); + vrate = gst_element_factory_make("videorate", "vrate"); vrfilter = gst_element_factory_make("capsfilter", "vrfilter"); vrsink = gst_element_factory_make("fakesink", "vrsink"); @@ -409,7 +431,7 @@ int createPipelineCam(Appdata& appdata) vcrscfilter = gst_element_factory_make("capsfilter", "vcrscfilter"); vcrssink = gst_element_factory_make("fakesink", "vcrssink"); - if (!facecam || !source || !filter || !sdec || !sscale || + if (!facecam || !source || !filter || !sdec || !sscale || !srate || !tee || !queue1 || !vscale || !vsfilter || !vconv || !vcfilter || !vrate || !vrfilter || !vrsink || !queue2 || !oconv || !coverlay || !sink || !sink2 || @@ -421,7 +443,13 @@ int createPipelineCam(Appdata& appdata) g_signal_connect(coverlay, "draw", G_CALLBACK(cairo_overlay_handler), &appdata); g_signal_connect(coverlay, "caps-changed", G_CALLBACK (prepare_overlay), 
&appdata); - g_object_set(G_OBJECT(source), "device", "/dev/video0", NULL); + if (appdata.input == 0) { + g_object_set(G_OBJECT(source), "device", "/dev/video2", NULL); + } else { + g_object_set(G_OBJECT(source), "location", "/tmp/sample.jpg", NULL); + g_object_set(G_OBJECT(source), "loop", TRUE, NULL); + } + g_object_set(G_OBJECT(sink2), "use-tbm", FALSE, NULL); g_object_set(G_OBJECT(sink2), "sync", FALSE, NULL); g_object_set(G_OBJECT(sink), "video-sink", sink2, NULL); @@ -453,14 +481,17 @@ int createPipelineCam(Appdata& appdata) gst_bin_add_many(GST_BIN(facecam), - source, sdec, sscale, filter, + source, sdec, sscale, srate, filter, tee, queue1, vscale, vsfilter, vconv, vcfilter, vrate, vrfilter, vrsink, queue2, oconv, coverlay, sink, queue3, vcrop, vcrscale, vcrsfilter, vcrsconv, vcrscfilter, vcrssink, NULL); /* link elements */ - gst_element_link_many(source, sdec, sscale, filter, tee, NULL); + if (appdata.input == 0 ) + gst_element_link_many(source, /*sdec,*/ sscale, filter, tee, NULL); + else + gst_element_link_many(source, sdec, sscale, srate, filter, tee, NULL); // pose gst_element_link_many (tee, queue3, vcrop, vcrscale, vcrsfilter, vcrsconv, vcrscfilter, vcrssink, NULL); // display @@ -490,6 +521,11 @@ int main(int argc, char *argv[]) appdata.numFrame = 0; appdata.flandmark_num = 0; appdata.overlay_state = g_new0(CairoOverlayState, 1); + if (argc == 2) + appdata.input = atoi(argv[1]); // 0: gst camera, 1: gst image file + else + appdata.input = 0; + int ret = MEDIA_VISION_ERROR_NONE; printf("enter main\n"); -- cgit v1.2.3