diff options
Diffstat (limited to 'eng/common/templates/steps/telemetry-end.yml')
-rw-r--r-- | eng/common/templates/steps/telemetry-end.yml | 87 |
1 files changed, 63 insertions, 24 deletions
diff --git a/eng/common/templates/steps/telemetry-end.yml b/eng/common/templates/steps/telemetry-end.yml index 9b61481e7e..fadc04ca1b 100644 --- a/eng/common/templates/steps/telemetry-end.yml +++ b/eng/common/templates/steps/telemetry-end.yml @@ -1,3 +1,7 @@ +parameters: + maxRetries: 5 + retryDelay: 10 # in seconds + steps: - bash: | if [ "$AGENT_JOBSTATUS" = "Succeeded" ] || [ "$AGENT_JOBSTATUS" = "PartiallySucceeded" ]; then @@ -7,27 +11,41 @@ steps: fi warningCount=0 - # create a temporary file for curl output - res=`mktemp` - - curlResult=` - curl --verbose --output $res --write-out "%{http_code}"\ - -H 'Content-Type: application/json' \ - -H "X-Helix-Job-Token: $Helix_JobToken" \ - -H 'Content-Length: 0' \ - -X POST -G "https://helix.dot.net/api/2018-03-14/telemetry/job/build/$Helix_WorkItemId/finish" \ - --data-urlencode "errorCount=$errorCount" \ - --data-urlencode "warningCount=$warningCount"` - curlStatus=$? - - if [ $curlStatus -eq 0 ]; then - if [ $curlResult -gt 299 ] || [ $curlResult -lt 200 ]; then - curlStatus=$curlResult + curlStatus=1 + retryCount=0 + # retry loop to harden against spotty telemetry connections + # we don't retry successes and 4xx client errors + until [[ $curlStatus -eq 0 || ( $curlStatus -ge 400 && $curlStatus -le 499 ) || $retryCount -ge $MaxRetries ]] + do + if [ $retryCount -gt 0 ]; then + echo "Failed to send telemetry to Helix; waiting $RetryDelay seconds before retrying..." + sleep $RetryDelay fi - fi + + # create a temporary file for curl output + res=`mktemp` + + curlResult=` + curl --verbose --output $res --write-out "%{http_code}"\ + -H 'Content-Type: application/json' \ + -H "X-Helix-Job-Token: $Helix_JobToken" \ + -H 'Content-Length: 0' \ + -X POST -G "https://helix.dot.net/api/2018-03-14/telemetry/job/build/$Helix_WorkItemId/finish" \ + --data-urlencode "errorCount=$errorCount" \ + --data-urlencode "warningCount=$warningCount"` + curlStatus=$? + + if [ $curlStatus -eq 0 ]; then + if [ $curlResult -gt 299 ] || [ $curlResult -lt 200 ]; then + curlStatus=$curlResult + fi + fi + + let retryCount++ + done if [ $curlStatus -ne 0 ]; then - echo "Failed to Send Build Finish information" + echo "Failed to Send Build Finish information after $retryCount retries" vstsLogOutput="vso[task.logissue type=error;sourcepath=templates/steps/telemetry-end.yml;code=1;]Failed to Send Build Finish information: $curlStatus" echo "##$vstsLogOutput" exit 1 @@ -37,6 +55,8 @@ steps: # defined via VSTS variables in start-job.sh Helix_JobToken: $(Helix_JobToken) Helix_WorkItemId: $(Helix_WorkItemId) + MaxRetries: ${{ parameters.maxRetries }} + RetryDelay: ${{ parameters.retryDelay }} condition: and(always(), ne(variables['Agent.Os'], 'Windows_NT')) - powershell: | if (($env:Agent_JobStatus -eq 'Succeeded') -or ($env:Agent_JobStatus -eq 'PartiallySucceeded')) { @@ -46,13 +66,30 @@ steps: } $WarningCount = 0 - try { - Invoke-RestMethod -Uri "https://helix.dot.net/api/2018-03-14/telemetry/job/build/$env:Helix_WorkItemId/finish?errorCount=$ErrorCount&warningCount=$WarningCount" -Method Post -ContentType "application/json" -Body "" ` - -Headers @{ 'X-Helix-Job-Token'=$env:Helix_JobToken } + # Basic retry loop to harden against server flakiness + $retryCount = 0 + while ($retryCount -lt $env:MaxRetries) { + try { + Invoke-RestMethod -Uri "https://helix.dot.net/api/2018-03-14/telemetry/job/build/$env:Helix_WorkItemId/finish?errorCount=$ErrorCount&warningCount=$WarningCount" -Method Post -ContentType "application/json" -Body "" ` + -Headers @{ 'X-Helix-Job-Token'=$env:Helix_JobToken } + break + } + catch { + $statusCode = $_.Exception.Response.StatusCode.value__ + if ($statusCode -ge 400 -and $statusCode -le 499) { + Write-Host "##vso[task.logissue]error Failed to send telemetry to Helix (status code $statusCode); not retrying (4xx client error)" + Write-Host "##vso[task.logissue]error ", $_.Exception.GetType().FullName, $_.Exception.Message + exit 1 + } + Write-Host "Failed to send telemetry to Helix (status code $statusCode); waiting $env:RetryDelay seconds before retrying..." + $retryCount++ + sleep $env:RetryDelay + continue + } } - catch { - Write-Error $_ - Write-Error $_.Exception + + if ($retryCount -ge $env:MaxRetries) { + Write-Host "##vso[task.logissue]error Failed to send telemetry to Helix after $retryCount retries." exit 1 } displayName: Send Windows Build End Telemetry @@ -60,4 +97,6 @@ steps: # defined via VSTS variables in start-job.ps1 Helix_JobToken: $(Helix_JobToken) Helix_WorkItemId: $(Helix_WorkItemId) + MaxRetries: ${{ parameters.maxRetries }} + RetryDelay: ${{ parameters.retryDelay }} condition: and(always(),eq(variables['Agent.Os'], 'Windows_NT')) |