-rw-r--r--  Documentation/botr/clr-abi.md                       |    4
-rw-r--r--  Documentation/design-docs/finally-optimizations.md  |  451
-rw-r--r--  src/jit/block.cpp                                   |   12
-rw-r--r--  src/jit/block.h                                     |   15
-rw-r--r--  src/jit/compiler.h                                  |    5
-rw-r--r--  src/jit/compphases.h                                |    6
-rw-r--r--  src/jit/flowgraph.cpp                               | 1037
-rw-r--r--  src/jit/jitconfigvalues.h                           |    8
-rw-r--r--  src/jit/jiteh.cpp                                   |    2
-rw-r--r--  src/jit/jiteh.h                                     |    4
-rw-r--r--  src/jit/morph.cpp                                   |    8
11 files changed, 1538 insertions, 14 deletions
diff --git a/Documentation/botr/clr-abi.md b/Documentation/botr/clr-abi.md
index caa5c7add9..1a6034467f 100644
--- a/Documentation/botr/clr-abi.md
+++ b/Documentation/botr/clr-abi.md
@@ -149,7 +149,9 @@ For x86, all handlers are generated within the method body, typically in lexical
JIT64 attempts to speed the normal control flow by 'inlining' a called finally along the 'normal' control flow (i.e., leaving a try body in a non-exceptional manner via C# fall-through). Because the VM semantics for non-rude Thread.Abort dictate that handlers will not be aborted, the JIT must mark these 'inlined' finally bodies. These show up as special entries at the end of the EH tables and are marked with `COR_ILEXCEPTION_CLAUSE_FINALLY | COR_ILEXCEPTION_CLAUSE_DUPLICATED`, and the try_start, try_end, and handler_start are all the same: the start of the cloned finally.
-JIT32 and RyuJIT currently do not implement finally cloning.
+RyuJIT also implements finally cloning for all supported architectures. However, the implementation does not yet handle the thread abort case: cloned finally bodies are not guaranteed to remain intact and are not reported to the runtime. Because of this, finally cloning is disabled for VMs that support thread abort (the desktop CLR).
+
+JIT32 does not implement finally cloning.
## Invoking Finallys/Non-local exits
diff --git a/Documentation/design-docs/finally-optimizations.md b/Documentation/design-docs/finally-optimizations.md
new file mode 100644
index 0000000000..ddc6dbd93c
--- /dev/null
+++ b/Documentation/design-docs/finally-optimizations.md
@@ -0,0 +1,451 @@
+Finally Optimizations
+=====================
+
+In MSIL, a try-finally is a construct where a block of code
+(the finally) is guaranteed to be executed after control leaves a
+protected region of code (the try) either normally or via an
+exception.
+
+In RyuJit a try-finally is currently implemented by transforming the
+finally into a local function that is invoked by jitted code at normal
+exits from the try block, and invoked by the runtime for exceptional
+exits from the try block.
+
+For x86 the local function is simply a part of the method and shares
+the same stack frame with the method. For other architectures the
+local function is promoted to a potentially separable "funclet"
+which is almost like a regular function with a prolog and epilog. A
+custom calling convention gives the funclet access to the parent stack
+frame.
+
+In this proposal we outline two optimizations for finallys: removing
+empty finallys and finally cloning.
+
+Empty Finally Removal
+---------------------
+
+An empty finally is one that has no observable effect. These often
+arise from `foreach` or `using` constructs (which induce a
+try-finally) where the cleanup method called in the finally does
+nothing. Often, after inlining, the empty finally is readily apparent.
+
+For example, this snippet of C# code
+```C#
+static int Sum(List<int> x) {
+    int sum = 0;
+    foreach (int i in x) {
+        sum += i;
+    }
+    return sum;
+}
+```
+produces the following jitted code:
+```asm
+; Successfully inlined Enumerator[Int32][System.Int32]:Dispose():this
+; (1 IL bytes) (depth 1) [below ALWAYS_INLINE size]
+G_M60484_IG01:
+ 55 push rbp
+ 57 push rdi
+ 56 push rsi
+ 4883EC50 sub rsp, 80
+ 488D6C2460 lea rbp, [rsp+60H]
+ 488BF1 mov rsi, rcx
+ 488D7DD0 lea rdi, [rbp-30H]
+ B906000000 mov ecx, 6
+ 33C0 xor rax, rax
+ F3AB rep stosd
+ 488BCE mov rcx, rsi
+ 488965C0 mov qword ptr [rbp-40H], rsp
+
+G_M60484_IG02:
+ 33C0 xor eax, eax
+ 8945EC mov dword ptr [rbp-14H], eax
+ 8B01 mov eax, dword ptr [rcx]
+ 8B411C mov eax, dword ptr [rcx+28]
+ 33D2 xor edx, edx
+ 48894DD0 mov gword ptr [rbp-30H], rcx
+ 8955D8 mov dword ptr [rbp-28H], edx
+ 8945DC mov dword ptr [rbp-24H], eax
+ 8955E0 mov dword ptr [rbp-20H], edx
+
+G_M60484_IG03:
+ 488D4DD0 lea rcx, bword ptr [rbp-30H]
+ E89B35665B call Enumerator[Int32][System.Int32]:MoveNext():bool:this
+ 85C0 test eax, eax
+ 7418 je SHORT G_M60484_IG05
+
+; Body of foreach loop
+
+G_M60484_IG04:
+ 8B4DE0 mov ecx, dword ptr [rbp-20H]
+ 8B45EC mov eax, dword ptr [rbp-14H]
+ 03C1 add eax, ecx
+ 8945EC mov dword ptr [rbp-14H], eax
+ 488D4DD0 lea rcx, bword ptr [rbp-30H]
+ E88335665B call Enumerator[Int32][System.Int32]:MoveNext():bool:this
+ 85C0 test eax, eax
+ 75E8 jne SHORT G_M60484_IG04
+
+; Normal exit from the implicit try region created by `foreach`
+; Calls the finally to dispose of the iterator
+
+G_M60484_IG05:
+ 488BCC mov rcx, rsp
+ E80C000000 call G_M60484_IG09 // call to finally
+
+G_M60484_IG06:
+ 90 nop
+
+G_M60484_IG07:
+ 8B45EC mov eax, dword ptr [rbp-14H]
+
+G_M60484_IG08:
+ 488D65F0 lea rsp, [rbp-10H]
+ 5E pop rsi
+ 5F pop rdi
+ 5D pop rbp
+ C3 ret
+
+; Finally funclet. Note it simply sets up and then tears down a stack
+; frame. The dispose method was inlined and is empty.
+
+G_M60484_IG09:
+ 55 push rbp
+ 57 push rdi
+ 56 push rsi
+ 4883EC30 sub rsp, 48
+ 488B6920 mov rbp, qword ptr [rcx+32]
+ 48896C2420 mov qword ptr [rsp+20H], rbp
+ 488D6D60 lea rbp, [rbp+60H]
+
+G_M60484_IG10:
+ 4883C430 add rsp, 48
+ 5E pop rsi
+ 5F pop rdi
+ 5D pop rbp
+ C3 ret
+```
+
+In such cases the try-finally can be removed, leading to code like the following:
+```asm
+G_M60484_IG01:
+ 57 push rdi
+ 56 push rsi
+ 4883EC38 sub rsp, 56
+ 488BF1 mov rsi, rcx
+ 488D7C2420 lea rdi, [rsp+20H]
+ B906000000 mov ecx, 6
+ 33C0 xor rax, rax
+ F3AB rep stosd
+ 488BCE mov rcx, rsi
+
+G_M60484_IG02:
+ 33F6 xor esi, esi
+ 8B01 mov eax, dword ptr [rcx]
+ 8B411C mov eax, dword ptr [rcx+28]
+ 48894C2420 mov gword ptr [rsp+20H], rcx
+ 89742428 mov dword ptr [rsp+28H], esi
+ 8944242C mov dword ptr [rsp+2CH], eax
+ 89742430 mov dword ptr [rsp+30H], esi
+
+G_M60484_IG03:
+ 488D4C2420 lea rcx, bword ptr [rsp+20H]
+ E8A435685B call Enumerator[Int32][System.Int32]:MoveNext():bool:this
+ 85C0 test eax, eax
+ 7414 je SHORT G_M60484_IG05
+
+G_M60484_IG04:
+ 8B4C2430 mov ecx, dword ptr [rsp+30H]
+ 03F1 add esi, ecx
+ 488D4C2420 lea rcx, bword ptr [rsp+20H]
+ E89035685B call Enumerator[Int32][System.Int32]:MoveNext():bool:this
+ 85C0 test eax, eax
+ 75EC jne SHORT G_M60484_IG04
+
+G_M60484_IG05:
+ 8BC6 mov eax, esi
+
+G_M60484_IG06:
+ 4883C438 add rsp, 56
+ 5E pop rsi
+ 5F pop rdi
+ C3 ret
+```
+
+Empty finally removal is unconditionally profitable: it should always
+reduce code size and improve code speed.
+
+Finally Cloning
+---------------
+
+Finally cloning is an optimization where the jit duplicates the code
+in the finally for one or more of the normal exit paths from the try,
+and has those exit points branch to the duplicated code directly,
+rather than calling the finally. This transformation allows for
+improved performance and optimization of the common case where the try
+completes without an exception.
+
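+As a rough, hand-written illustration (not actual jit output), the
+transformation has approximately the effect of the rewrite below, where
+`Body`, `Cleanup`, and `After` are placeholder methods. C# cannot express
+a try/fault directly, so a catch-all that rethrows stands in for the
+fault handler the jit actually creates:
+
+```C#
+static void Original()
+{
+    try { Body(); }
+    finally { Cleanup(); }
+    After();
+}
+
+// After cloning: the normal exit path runs an inline copy of the finally,
+// and the original handler covers only the exceptional path.
+static void Cloned()
+{
+    try { Body(); }
+    catch { Cleanup(); throw; }
+    Cleanup(); // cloned finally body, reached only on the normal path
+    After();
+}
+
+static void Body() { }
+static void Cleanup() { }
+static void After() { }
+```
+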
+Finally cloning also allows hot/cold splitting of finally bodies: the
+cloned finally code covers the normal try exit paths (the hot cases)
+and can be placed in the main method region, and the original finally,
+now used largely or exclusively for exceptional cases (the cold cases),
+can be split off into the cold code region. Without cloning, RyuJit
+would always treat the finally as cold code.
+
+Finally cloning will increase code size, though often the size
+increase is mitigated somewhat by more compact code generation in the
+try body and streamlined invocation of the cloned finallys.
+
+Try-finally regions may have multiple normal exit points. For example
+the following `try` has two: one at the `return 3` and one at the try
+region end:
+
+```C#
+try {
+ if (p) return 3;
+ ...
+}
+finally {
+ ...
+}
+return 4;
+```
+
+Here the finally must be executed no matter how the try exits. So
+there are two normal exit paths from the try, both of which pass
+through the finally but which then diverge. The fact that some try
+regions can have multiple exits opens the potential for substantial
+code growth from finally cloning, and so leads to a choice point in
+the implementation:
+
+* Share the clone along all exit paths
+* Share the clone along some exit paths
+* Clone along all exit paths
+* Clone along some exit paths
+* Only clone along one exit path
+* Only clone when there is one exit path
+
+The shared clone option must essentially recreate or simulate the
+local call mechanism for the finally, though likely somewhat more
+efficiently. Each exit point must designate where control should
+resume once the shared finally has finished. For instance the jit
+could introduce a new local per try-finally to determine where the
+cloned finally should resume, and enumerate the possibilities using a
+small integer. The end of the cloned finally would then use a switch
+to determine what code to execute next. This has the downside of
+introducing unrealizable paths into the control flow graph.
+
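+As an illustration only (not part of the proposed implementation), a
+hand-written C# approximation of this shared-clone scheme for the example
+above might look like the following; the `resumePoint` selector and the
+`Cleanup` placeholder are hypothetical:
+
+```C#
+static int SharedCloneSketch(bool p)
+{
+    int resumePoint;
+
+    // Try body: each normal exit records which continuation it wants.
+    if (p)
+    {
+        resumePoint = 0; // was "return 3"
+    }
+    else
+    {
+        // ... rest of the try body ...
+        resumePoint = 1; // was the fall-through out of the try
+    }
+
+    // Single shared copy of the cloned finally body.
+    Cleanup();
+
+    // Dispatch to the recorded continuation. The flow graph now contains
+    // unrealizable paths: for example, the exit that set resumePoint to 0
+    // appears able to reach the default case, though it never can.
+    switch (resumePoint)
+    {
+        case 0:
+            return 3;
+        default:
+            return 4;
+    }
+}
+
+static void Cleanup() { }
+```
+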
+Cloning along all exit paths can potentially lead to large amounts of
+code growth.
+
+Cloning along some paths or only one path implies that some normal
+exit paths won't be as well optimized. Nonetheless cloning along one
+path was the choice made by JIT64 and the one we recommend for
+implementation. In particular we suggest only cloning along the end of
+try region exit path, so that any early exit will continue to invoke
+the funclet for finally cleanup (unless that exit happens to have the
+same post-finally continuation as the end try region exit, in which
+case it can simply jump to the cloned finally).
+
+One can imagine adaptive strategies. The size of the finally can
+be roughly estimated and the number of clones needed for full cloning
+readily computed. Selective cloning can be based on profile
+feedback or other similar mechanisms for choosing the profitable
+cases.
+
+The current implementation will clone the finally and retarget the
+last (largest IL offset) leave in the try region to the clone. Any
+other leave that ultimately transfers control to the same post-finally
+offset will also be modified to jump to the clone.
+
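+To make this policy concrete, consider the hypothetical method below
+(`Work` and `Cleanup` are placeholders). The fall-through exit at the end
+of the try is the one retargeted to the clone; the `goto done` exit shares
+the same post-finally continuation and so is retargeted as well, while the
+early `return 3` has a different continuation and keeps invoking the
+finally funclet:
+
+```C#
+static int CloneRetargetSketch(bool p, bool q)
+{
+    try
+    {
+        if (p)
+        {
+            return 3; // different continuation: still calls the finally funclet
+        }
+        if (q)
+        {
+            goto done; // same post-finally offset as the end of the try:
+                       // also retargeted to the cloned finally
+        }
+        Work();
+    } // end-of-try exit: the chosen path, branches to the cloned finally
+    finally
+    {
+        Cleanup();
+    }
+done:
+    return 4;
+}
+
+static void Work() { }
+static void Cleanup() { }
+```
+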
+Empirical studies have shown that most finallys are small. Thus to
+avoid excessive code growth, a crude size estimate is formed by
+counting the number of statements in the blocks that make up the
+finally. Any finally larger than 15 statements is not cloned. In our
+study this disqualified about 0.5% of all finallys from cloning.
+
+### EH Nesting Considerations
+
+Finally cloning is also more complicated when the finally encloses
+other EH regions, since the clone will introduce copies of all these
+regions. While it is possible to implement cloning in such cases we
+propose to defer for now.
+
+Finally cloning is also a bit more complicated if the finally is
+enclosed by another finally region, so we likewise propose deferring
+support for this. (Seems like a rare enough thing but maybe not too
+hard to handle -- though possibly not worth it if we're not going to
+support the enclosing case).
+
+### Control-Flow and Other Considerations
+
+If the try never exits normally, then the finally can only be invoked
+in exceptional cases. There is no benefit to cloning since the cloned
+finally would be unreachable. We can detect a subset of such cases
+because there will be no call finally blocks.
+
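+For example, in the hypothetical method below the try always throws, so
+there are no callfinally blocks and a cloned finally would be unreachable
+(the `Cleanup` call is a placeholder):
+
+```C#
+static void NeverExitsNormally()
+{
+    try
+    {
+        throw new System.InvalidOperationException("no normal exit");
+    }
+    finally
+    {
+        Cleanup();
+    }
+}
+
+static void Cleanup() { }
+```
+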
+JIT64 does not clone finallys that contain a switch. We propose to
+do likewise. (Initially I did not include this restriction but
+hit a failing test case where the finally contained a switch. Might be
+worth a deeper look, though such cases are presumably rare.)
+
+If the finally never exits normally, then we presume it is cold code,
+and so will not clone.
+
+If the finally is marked as run rarely, we will not clone.
+
+Implementation Proposal
+-----------------------
+
+We propose that empty finally removal and finally cloning be run back
+to back, spliced into the phase list just after fgInline and
+fgAddInternal, and just before implicit by-ref and struct
+promotion. We want to run these early before a lot of structural
+invariants regarding EH are put in place, and before most
+other optimization, but run them after inlining
+(so empty finallys can be more readily identified) and after the
+addition of implicit try-finallys created by the jit. Empty finallys
+may arise later because of optimization, but this seems relatively
+uncommon.
+
+We will remove empty finallys first, then clone.
+
+Neither optimization will run when the jit is generating debuggable
+code or operating in min opts mode.
+
+### Empty Finally Removal (Sketch)
+
+Skip over methods that have no EH, are compiled with min opts, or
+where the jit is generating debuggable code.
+
+Walk the handler table, looking for try-finally (we could also look
+for and remove try-faults with empty faults, but those are presumably
+rare).
+
+If the finally is a single block and contains only a `GT_RETFILT`
+statement, then:
+
+* Retarget the callfinally(s) to jump always to the continuation blocks.
+* Remove the paired jump always block(s) (note we expect all finally
+calls to be paired since the empty finally returns).
+* For funclet EH models with finally target bits, clear the finally
+target from the continuations.
+* For non-funclet EH models only, clear out the GT_END_LFIN statement
+in the finally continuations.
+* Remove the handler block.
+* Reparent all directly contained try blocks to the enclosing try region
+or to the method region if there is no enclosing try.
+* Remove the try-finally from the EH table via `fgRemoveEHTableEntry`.
+
+After the walk, if any empty finallys were removed, revalidate the
+integrity of the handler table.
+
+### Finally Cloning (Sketch)
+
+Skip over all methods if the runtime supports thread abort. More on
+this below.
+
+Skip over methods that have no EH, are compiled with min opts, or
+where the jit is generating debuggable code.
+
+Walk the handler table, looking for try-finally. If the finally is
+enclosed in a handler or encloses another handler, skip.
+
+Walk the finally body blocks. If any is BBJ_SWITCH, or if none
+is BBJ_EHFINALLYRET, skip cloning. If all blocks are RunRarely,
+skip cloning. If the finally has more than 15 statements, skip
+cloning.
+
+Walk the try region from back to front (from largest to smallest IL
+offset). Find the last block in the try that invokes the finally. That
+will be the path that will invoke the clone.
+
+If the EH model requires callfinally thunks, and there are multiple
+thunks that invoke the finally, and the callfinally thunk along the
+clone path is not the first, move it to the front (this helps avoid
+extra jumps).
+
+Set the insertion point to just after the callfinally in the path (for
+thunk models) or the end of the try (for non-thunk models). Set up a
+block map. Clone the finally body using `fgNewBBinRegion` and
+`fgNewBBafter` to make the first and subsequent blocks, and
+`CloneBlockState` to fill in the block contents. Clear the handler
+region on the cloned blocks. Bail out if cloning fails. Mark the first
+and last cloned blocks with appropriate BBF flags. Patch up inter-clone
+branches and convert the returns into jumps to the continuation.
+
+Walk the callfinallys, retargeting the ones that return to the
+continuation so that they invoke the clone. Remove the paired always
+blocks. Clear the finally target bit and any GT_END_LFIN from the
+continuation.
+
+If all call finallys are converted, modify the region to be try/fault
+(internally EH_HANDLER_FAULT_WAS_FINALLY, so we can distinguish it
+later from "organic" try/faults). Otherwise leave it as a
+try/finally.
+
+Clear the catch type on the clone entry.
+
+### Thread Abort
+
+For runtimes that support thread abort (desktop), more work is
+required:
+
+* The cloned finally must be reported to the runtime. Likely this
+can trigger off of the BBF_CLONED_FINALLY_BEGIN/END flags.
+* The jit must maintain the integrity of the clone by not losing
+track of the blocks involved, and not allowing code to move in or
+out of the cloned region.
+
+Code Size Impact
+----------------
+
+Code size impact from finally cloning was measured for CoreCLR on
+Windows x64.
+
+```
+Total bytes of diff: 16158 (0.12 % of base)
+ diff is a regression.
+Total byte diff includes 0 bytes from reconciling methods
+ Base had 0 unique methods, 0 unique bytes
+ Diff had 0 unique methods, 0 unique bytes
+Top file regressions by size (bytes):
+ 3518 : Microsoft.CodeAnalysis.CSharp.dasm (0.16 % of base)
+ 1895 : System.Linq.Expressions.dasm (0.32 % of base)
+ 1626 : Microsoft.CodeAnalysis.VisualBasic.dasm (0.07 % of base)
+ 1428 : System.Threading.Tasks.Parallel.dasm (4.66 % of base)
+ 1248 : System.Linq.Parallel.dasm (0.20 % of base)
+Top file improvements by size (bytes):
+ -4529 : System.Private.CoreLib.dasm (-0.14 % of base)
+ -975 : System.Reflection.Metadata.dasm (-0.28 % of base)
+ -239 : System.Private.Uri.dasm (-0.27 % of base)
+ -104 : System.Runtime.InteropServices.RuntimeInformation.dasm (-3.36 % of base)
+ -99 : System.Security.Cryptography.Encoding.dasm (-0.61 % of base)
+57 total files with size differences.
+Top method regessions by size (bytes):
+ 645 : System.Diagnostics.Process.dasm - System.Diagnostics.Process:StartCore(ref):bool:this
+ 454 : Microsoft.CSharp.dasm - Microsoft.CSharp.RuntimeBinder.Semantics.ExpressionBinder:AdjustCallArgumentsForParams(ref,ref,ref,ref,ref,byref):this
+ 447 : System.Threading.Tasks.Dataflow.dasm - System.Threading.Tasks.Dataflow.Internal.SpscTargetCore`1[__Canon][System.__Canon]:ProcessMessagesLoopCore():this
+ 421 : Microsoft.CodeAnalysis.VisualBasic.dasm - Microsoft.CodeAnalysis.VisualBasic.Symbols.ImplementsHelper:FindExplicitlyImplementedMember(ref,ref,ref,ref,ref,ref,byref):ref
+ 358 : System.Private.CoreLib.dasm - System.Threading.TimerQueueTimer:Change(int,int):bool:this
+Top method improvements by size (bytes):
+ -2512 : System.Private.CoreLib.dasm - DomainNeutralILStubClass:IL_STUB_CLRtoWinRT():ref:this (68 methods)
+ -824 : Microsoft.CodeAnalysis.dasm - Microsoft.Cci.PeWriter:WriteHeaders(ref,ref,ref,ref,byref):this
+ -663 : System.Private.CoreLib.dasm - DomainNeutralILStubClass:IL_STUB_CLRtoWinRT(ref):int:this (17 methods)
+ -627 : System.Private.CoreLib.dasm - System.Diagnostics.Tracing.ManifestBuilder:CreateManifestString():ref:this
+ -546 : System.Private.CoreLib.dasm - DomainNeutralILStubClass:IL_STUB_WinRTtoCLR(long):int:this (67 methods)
+3014 total methods with size differences.
+```
+
+The largest growth is seen in `Process:StartCore`, which has 4
+try-finally constructs.
+
+Diffs generally show improved codegen in the try bodies with cloned
+finallys. However, some of this improvement comes from more aggressive
+use of callee-save registers, which causes size inflation in the
+funclets (note that finally cloning does not alter the number of
+funclets). So if funclet save/restore were limited to the registers
+actually used in the funclet, the size impact would be slightly smaller.
+
+There are also some instances where cloning relatively small finallys
+leads to large code size increases. xxx is one example.
diff --git a/src/jit/block.cpp b/src/jit/block.cpp
index 47f1052cc8..bb6a57c25b 100644
--- a/src/jit/block.cpp
+++ b/src/jit/block.cpp
@@ -365,6 +365,14 @@ void BasicBlock::dspFlags()
{
printf("KEEP ");
}
+ if (bbFlags & BBF_CLONED_FINALLY_BEGIN)
+ {
+ printf("cfb ");
+ }
+ if (bbFlags & BBF_CLONED_FINALLY_END)
+ {
+ printf("cfe ");
+ }
}
/*****************************************************************************
@@ -664,7 +672,7 @@ bool BasicBlock::IsLIR()
// Return Value:
// The first statement in the block's bbTreeList.
//
-GenTreeStmt* BasicBlock::firstStmt()
+GenTreeStmt* BasicBlock::firstStmt() const
{
if (bbTreeList == nullptr)
{
@@ -683,7 +691,7 @@ GenTreeStmt* BasicBlock::firstStmt()
// Return Value:
// The last statement in the block's bbTreeList.
//
-GenTreeStmt* BasicBlock::lastStmt()
+GenTreeStmt* BasicBlock::lastStmt() const
{
if (bbTreeList == nullptr)
{
diff --git a/src/jit/block.h b/src/jit/block.h
index 40a1c356a0..cb02afd840 100644
--- a/src/jit/block.h
+++ b/src/jit/block.h
@@ -353,15 +353,18 @@ struct BasicBlock : private LIR::Range
// BBJ_CALLFINALLY block, as well as, on x86, the final step block out of a
// finally.
+#define BBF_CLONED_FINALLY_BEGIN 0x100000000 // First block of a cloned finally region
+#define BBF_CLONED_FINALLY_END 0x200000000 // Last block of a cloned finally region
+
// Flags that relate blocks to loop structure.
#define BBF_LOOP_FLAGS (BBF_LOOP_PREHEADER | BBF_LOOP_HEAD | BBF_LOOP_CALL0 | BBF_LOOP_CALL1)
- bool isRunRarely()
+ bool isRunRarely() const
{
return ((bbFlags & BBF_RUN_RARELY) != 0);
}
- bool isLoopHead()
+ bool isLoopHead() const
{
return ((bbFlags & BBF_LOOP_HEAD) != 0);
}
@@ -388,7 +391,7 @@ struct BasicBlock : private LIR::Range
// For example, the top block might or might not have BBF_GC_SAFE_POINT,
// but we assume it does not have BBF_GC_SAFE_POINT any more.
-#define BBF_SPLIT_LOST (BBF_GC_SAFE_POINT | BBF_HAS_JMP | BBF_KEEP_BBJ_ALWAYS)
+#define BBF_SPLIT_LOST (BBF_GC_SAFE_POINT | BBF_HAS_JMP | BBF_KEEP_BBJ_ALWAYS | BBF_CLONED_FINALLY_END)
// Flags gained by the bottom block when a block is split.
// Note, this is a conservative guess.
@@ -399,7 +402,7 @@ struct BasicBlock : private LIR::Range
#define BBF_SPLIT_GAINED \
(BBF_DONT_REMOVE | BBF_HAS_LABEL | BBF_HAS_JMP | BBF_BACKWARD_JUMP | BBF_HAS_IDX_LEN | BBF_HAS_NEWARRAY | \
- BBF_PROF_WEIGHT | BBF_HAS_NEWOBJ | BBF_KEEP_BBJ_ALWAYS)
+ BBF_PROF_WEIGHT | BBF_HAS_NEWOBJ | BBF_KEEP_BBJ_ALWAYS | BBF_CLONED_FINALLY_END)
#ifndef __GNUC__ // GCC doesn't like C_ASSERT at global scope
static_assert_no_msg((BBF_SPLIT_NONEXIST & BBF_SPLIT_LOST) == 0);
@@ -962,8 +965,8 @@ struct BasicBlock : private LIR::Range
return bbNum - 1;
}
- GenTreeStmt* firstStmt();
- GenTreeStmt* lastStmt();
+ GenTreeStmt* firstStmt() const;
+ GenTreeStmt* lastStmt() const;
GenTreeStmt* lastTopLevelStmt();
GenTree* firstNode();
diff --git a/src/jit/compiler.h b/src/jit/compiler.h
index a2ab6b3c12..acf858a1a5 100644
--- a/src/jit/compiler.h
+++ b/src/jit/compiler.h
@@ -3500,6 +3500,10 @@ public:
void fgInline();
+ void fgRemoveEmptyFinally();
+
+ void fgCloneFinally();
+
GenTreePtr fgGetCritSectOfStaticMethod();
#if !defined(_TARGET_X86_)
@@ -4271,6 +4275,7 @@ public:
void fgDebugCheckNodeLinks(BasicBlock* block, GenTreePtr stmt);
void fgDebugCheckFlags(GenTreePtr tree);
void fgDebugCheckFlagsHelper(GenTreePtr tree, unsigned treeFlags, unsigned chkFlags);
+ void fgDebugCheckTryFinallyExits();
#endif
#ifdef LEGACY_BACKEND
diff --git a/src/jit/compphases.h b/src/jit/compphases.h
index ac1bb636ff..655170f406 100644
--- a/src/jit/compphases.h
+++ b/src/jit/compphases.h
@@ -11,9 +11,10 @@
// corresponding array of string names of those phases. This include file undefines CompPhaseNameMacro
// after the last use.
// The arguments are:
-// CompPhaseNameMacro(enumName, stringName, hasChildren, parent)
+// CompPhaseNameMacro(enumName, stringName, shortName, hasChildren, parent)
// "enumName" is an Enumeration-style all-caps name.
// "stringName" is a self-explanatory.
+// "shortName" is an abbreviated form for stringName
// "hasChildren" is true if this phase is broken out into subphases.
// (We should never do EndPhase on a phase that has children, only on 'leaf phases.')
// "parent" is -1 for leaf phases, otherwise it is the "enumName" of the parent phase.
@@ -97,6 +98,9 @@ CompPhaseNameMacro(PHASE_EMIT_GCEH, "Emit GC+EH tables",
// for calls through ICorJitInfo across all "real" phases.
CompPhaseNameMacro(PHASE_CLR_API, "CLR API calls", "CLR-API", false, -1)
#endif
+
+CompPhaseNameMacro(PHASE_EMPTY_FINALLY, "Remove empty finally", "EMPTYFIN", false, -1)
+CompPhaseNameMacro(PHASE_CLONE_FINALLY, "Clone finally", "CLONEFIN", false, -1)
// clang-format on
#undef CompPhaseNameMacro
diff --git a/src/jit/flowgraph.cpp b/src/jit/flowgraph.cpp
index 441569c339..50c2621231 100644
--- a/src/jit/flowgraph.cpp
+++ b/src/jit/flowgraph.cpp
@@ -8550,8 +8550,12 @@ void Compiler::fgAddInternal()
GenTreeStmt* Compiler::fgNewStmtFromTree(GenTreePtr tree, BasicBlock* block, IL_OFFSETX offs)
{
GenTreeStmt* stmt = gtNewStmt(tree, offs);
- gtSetStmtInfo(stmt);
- fgSetStmtSeq(stmt);
+
+ if (fgStmtListThreaded)
+ {
+ gtSetStmtInfo(stmt);
+ fgSetStmtSeq(stmt);
+ }
#if DEBUG
if (block != nullptr)
@@ -12946,6 +12950,12 @@ bool Compiler::fgOptimizeBranchToEmptyUnconditional(BasicBlock* block, BasicBloc
optimizeJump = false;
}
+ // Don't optimize a jump to a cloned finally
+ if (bDest->bbFlags & BBF_CLONED_FINALLY_BEGIN)
+ {
+ optimizeJump = false;
+ }
+
#if FEATURE_EH_FUNCLETS && defined(_TARGET_ARM_)
// Don't optimize a jump to a finally target. For BB1->BB2->BB3, where
// BB2 is a finally target, if we changed BB1 to jump directly to BB3,
@@ -22471,3 +22481,1026 @@ void Compiler::fgLclFldAssign(unsigned lclNum)
lvaSetVarDoNotEnregister(lclNum DEBUGARG(DNER_LocalField));
}
}
+
+//------------------------------------------------------------------------
+// fgRemoveEmptyFinally: Remove try/finallys where the finally is empty
+//
+// Notes:
+// Removes all try/finallys in the method with empty finallys.
+// These typically arise from inlining empty Dispose methods.
+//
+// Converts callfinally to a jump to the finally continuation.
+// Removes the finally, and reparents all blocks in the try to the
+// enclosing try or method region.
+//
+// Currently limited to trivially empty finallys: those with one basic
+// block containing only a single RETFILT statement. It is possible but
+// not likely that more complex-looking finallys will eventually become
+// empty (from say subsequent optimization). An SPMI run with
+// just the "detection" part of this phase run after optimization
+// found only one example where a new empty finally was detected.
+
+void Compiler::fgRemoveEmptyFinally()
+{
+ JITDUMP("\n*************** In fgRemoveEmptyFinally()\n");
+
+ if (compHndBBtabCount == 0)
+ {
+ JITDUMP("No EH in this method, nothing to remove.\n");
+ return;
+ }
+
+ if (opts.MinOpts())
+ {
+ JITDUMP("Method compiled with minOpts, no removal.\n");
+ return;
+ }
+
+ if (opts.compDbgCode)
+ {
+ JITDUMP("Method compiled with debug codegen, no removal.\n");
+ return;
+ }
+
+#ifdef DEBUG
+ if (verbose)
+ {
+ printf("\n*************** Before fgRemoveEmptyFinally()\n");
+ fgDispBasicBlocks();
+ fgDispHandlerTab();
+ printf("\n");
+ }
+#endif // DEBUG
+
+ // Look for finallys or faults that are empty.
+ unsigned finallyCount = 0;
+ unsigned emptyCount = 0;
+ unsigned XTnum = 0;
+ while (XTnum < compHndBBtabCount)
+ {
+ EHblkDsc* const HBtab = &compHndBBtab[XTnum];
+
+ // Check if this is a try/finally. We could also look for empty
+ // try/fault but presumably those are rare.
+ if (!HBtab->HasFinallyHandler())
+ {
+ JITDUMP("EH#%u is not a try-finally; skipping.\n", XTnum);
+ XTnum++;
+ continue;
+ }
+
+ finallyCount++;
+
+ // Look at blocks involved.
+ BasicBlock* const firstBlock = HBtab->ebdHndBeg;
+ BasicBlock* const lastBlock = HBtab->ebdHndLast;
+
+ // Limit for now to finallys that are single blocks.
+ if (firstBlock != lastBlock)
+ {
+ JITDUMP("EH#%u finally has multiple basic blocks; skipping.\n", XTnum);
+ XTnum++;
+ continue;
+ }
+
+ // Limit for now to finallys that contain only a GT_RETFILT.
+ bool isEmpty = true;
+
+ for (GenTreeStmt* stmt = firstBlock->firstStmt(); stmt != nullptr; stmt = stmt->gtNextStmt)
+ {
+ GenTreePtr stmtExpr = stmt->gtStmtExpr;
+
+ if (stmtExpr->gtOper != GT_RETFILT)
+ {
+ isEmpty = false;
+ break;
+ }
+ }
+
+ if (!isEmpty)
+ {
+ JITDUMP("EH#%u finally is not empty; skipping.\n", XTnum);
+ XTnum++;
+ continue;
+ }
+
+ JITDUMP("EH#%u has empty finally, removing the region.\n", XTnum);
+
+ // Find all the call finallys that invoke this finally,
+ // and modify them to jump to the return point.
+ BasicBlock* firstCallFinallyRangeBlock = nullptr;
+ BasicBlock* endCallFinallyRangeBlock = nullptr;
+ ehGetCallFinallyBlockRange(XTnum, &firstCallFinallyRangeBlock, &endCallFinallyRangeBlock);
+
+ BasicBlock* currentBlock = firstCallFinallyRangeBlock;
+
+ while (currentBlock != endCallFinallyRangeBlock)
+ {
+ BasicBlock* nextBlock = currentBlock->bbNext;
+
+ if ((currentBlock->bbJumpKind == BBJ_CALLFINALLY) && (currentBlock->bbJumpDest == firstBlock))
+ {
+ // Retarget the call finally to jump to the return
+ // point.
+ //
+ // We don't expect to see retless finallys here, since
+ // the finally is empty.
+ noway_assert(currentBlock->isBBCallAlwaysPair());
+
+ BasicBlock* const leaveBlock = currentBlock->bbNext;
+ BasicBlock* const postTryFinallyBlock = leaveBlock->bbJumpDest;
+
+ noway_assert(leaveBlock->bbJumpKind == BBJ_ALWAYS);
+
+ currentBlock->bbJumpDest = postTryFinallyBlock;
+ currentBlock->bbJumpKind = BBJ_ALWAYS;
+
+ // Ref count updates.
+ fgAddRefPred(postTryFinallyBlock, currentBlock);
+ // fgRemoveRefPred(firstBlock, currentBlock);
+
+ // Delete the leave block, which should be marked as
+ // keep always.
+ assert((leaveBlock->bbFlags & BBF_KEEP_BBJ_ALWAYS) != 0);
+ nextBlock = leaveBlock->bbNext;
+
+ leaveBlock->bbFlags &= ~BBF_KEEP_BBJ_ALWAYS;
+ fgRemoveBlock(leaveBlock, true);
+
+ // The postTryFinallyBlock may be a finalStep block.
+ // It is now a normal block, so clear the special keep
+ // always flag.
+ postTryFinallyBlock->bbFlags &= ~BBF_KEEP_BBJ_ALWAYS;
+
+#if FEATURE_EH_FUNCLETS && defined(_TARGET_ARM_)
+ // Also, clear the finally target bit for arm
+ fgClearFinallyTargetBit(postTryFinallyBlock);
+#endif // FEATURE_EH_FUNCLETS && defined(_TARGET_ARM_)
+
+#if !FEATURE_EH_FUNCLETS
+ // Remove the GT_END_LFIN from the post-try-finally block,
+ // since there is no finally anymore.
+ GenTreeStmt* endFinallyStmt = postTryFinallyBlock->lastStmt();
+ GenTreePtr endFinallyExpr = endFinallyStmt->gtStmtExpr;
+ assert(endFinallyExpr->gtOper == GT_END_LFIN);
+ fgRemoveStmt(postTryFinallyBlock, endFinallyStmt);
+#endif // !FEATURE_EH_FUNCLETS
+
+ // Make sure iteration isn't going off the deep end.
+ assert(leaveBlock != endCallFinallyRangeBlock);
+ }
+
+ currentBlock = nextBlock;
+ }
+
+ // Handler block should now be unreferenced, since the only
+ // explicit references to it were in call finallys.
+ firstBlock->bbRefs = 0;
+
+ // Remove the handler block.
+ const bool unreachable = true;
+ firstBlock->bbFlags &= ~BBF_DONT_REMOVE;
+ fgRemoveBlock(firstBlock, unreachable);
+
+ // Find enclosing try region for the try, if any, and update
+ // the try region. Note the handler region (if any) won't
+ // change.
+ BasicBlock* const firstTryBlock = HBtab->ebdTryBeg;
+ BasicBlock* const lastTryBlock = HBtab->ebdTryLast;
+ assert(firstTryBlock->getTryIndex() == XTnum);
+
+ for (BasicBlock* block = firstTryBlock; block != nullptr; block = block->bbNext)
+ {
+ // Look for blocks directly contained in this try, and
+ // update the try region appropriately.
+ //
+ // Try region for blocks transitively contained (say in a
+ // child try) will get updated by the subsequent call to
+ // fgRemoveEHTableEntry.
+ if (block->getTryIndex() == XTnum)
+ {
+ if (firstBlock->hasTryIndex())
+ {
+ block->setTryIndex(firstBlock->getTryIndex());
+ }
+ else
+ {
+ block->clearTryIndex();
+ }
+ }
+
+ if (block == firstTryBlock)
+ {
+ assert((block->bbFlags & BBF_TRY_BEG) != 0);
+ block->bbFlags &= ~BBF_TRY_BEG;
+ }
+
+ if (block == lastTryBlock)
+ {
+ break;
+ }
+ }
+
+ // Remove the try-finally EH region. This will compact the EH table
+ // so XTnum now points at the next entry.
+ fgRemoveEHTableEntry(XTnum);
+
+ emptyCount++;
+ }
+
+ if (emptyCount > 0)
+ {
+ JITDUMP("fgRemoveEmptyFinally() removed %u try-finally clauses from %u finallys\n", emptyCount, finallyCount);
+
+#ifdef DEBUG
+ if (verbose)
+ {
+ printf("\n*************** After fgRemoveEmptyFinally()\n");
+ fgDispBasicBlocks();
+ fgDispHandlerTab();
+ printf("\n");
+ }
+
+ fgVerifyHandlerTab();
+ fgDebugCheckBBlist(false, false);
+
+#endif // DEBUG
+ }
+}
+
+//------------------------------------------------------------------------
+// fgCloneFinally: Optimize normal exit path from a try/finally
+//
+// Notes:
+// Handles finallys that are not enclosed by or enclosing other
+// handler regions.
+//
+// Converts the "normal exit" callfinally to a jump to a cloned copy
+// of the finally, which in turn jumps to the finally continuation.
+//
+// If all callfinallys for a given finally are converted to jump to
+// the clone, the try-finally is modified into a try-fault,
+// distinguishable from organic try-faults by handler type
+// EH_HANDLER_FAULT_WAS_FINALLY vs the organic EH_HANDLER_FAULT.
+//
+// Does not yet handle thread abort. The open issues here are how
+// to maintain the proper description of the cloned finally blocks
+// as a handler (for thread abort purposes), how to prevent code
+// motion in or out of these blocks, and how to report this cloned
+// handler to the runtime. Some building blocks for thread abort
+// exist (see below) but more work is needed.
+//
+// The first and last blocks of the cloned finally are marked with
+// BBF_CLONED_FINALLY_BEGIN and BBF_CLONED_FINALLY_END. However
+// these markers currently can get lost during subsequent
+// optimizations.
+
+void Compiler::fgCloneFinally()
+{
+ JITDUMP("\n*************** In fgCloneFinally()\n");
+
+#if FEATURE_CORECLR
+ bool enableCloning = true;
+#else
+ // Finally cloning currently doesn't provide sufficient protection
+ // for the cloned code in the presence of thread abort.
+ bool enableCloning = false;
+#endif // FEATURE_CORECLR
+
+#if DEBUG
+ // Allow override to enable/disable.
+ enableCloning = (JitConfig.JitEnableFinallyCloning() == 1);
+#endif // DEBUG
+
+ if (!enableCloning)
+ {
+ JITDUMP("Finally cloning disabled.\n");
+ return;
+ }
+
+ if (compHndBBtabCount == 0)
+ {
+ JITDUMP("No EH in this method, no cloning.\n");
+ return;
+ }
+
+ if (opts.MinOpts())
+ {
+ JITDUMP("Method compiled with minOpts, no cloning.\n");
+ return;
+ }
+
+ if (opts.compDbgCode)
+ {
+ JITDUMP("Method compiled with debug codegen, no cloning.\n");
+ return;
+ }
+
+#ifdef DEBUG
+ if (verbose)
+ {
+ printf("\n*************** Before fgCloneFinally()\n");
+ fgDispBasicBlocks();
+ fgDispHandlerTab();
+ printf("\n");
+ }
+
+ // Verify try-finally exits look good before we start.
+ fgDebugCheckTryFinallyExits();
+
+#endif // DEBUG
+
+ // Look for finallys that are not contained within other handlers,
+ // and which do not themselves contain EH.
+ //
+ // Note these cases potentially could be handled, but are less
+ // obviously profitable and require modification of the handler
+ // table.
+ unsigned XTnum = 0;
+ EHblkDsc* HBtab = compHndBBtab;
+ unsigned cloneCount = 0;
+ for (; XTnum < compHndBBtabCount; XTnum++, HBtab++)
+ {
+ // Check if this is a try/finally
+ if (!HBtab->HasFinallyHandler())
+ {
+ JITDUMP("EH#%u is not a try-finally; skipping.\n", XTnum);
+ continue;
+ }
+
+ // Check if enclosed by another handler.
+ const unsigned enclosingHandlerRegion = ehGetEnclosingHndIndex(XTnum);
+
+ if (enclosingHandlerRegion != EHblkDsc::NO_ENCLOSING_INDEX)
+ {
+ JITDUMP("EH#%u is enclosed by handler EH#%u; skipping.\n", XTnum, enclosingHandlerRegion);
+ continue;
+ }
+
+ bool containsEH = false;
+ unsigned exampleEnclosedHandlerRegion = 0;
+
+ // Only need to look at lower numbered regions because the
+ // handler table is ordered by nesting.
+ for (unsigned i = 0; i < XTnum; i++)
+ {
+ if (ehGetEnclosingHndIndex(i) == XTnum)
+ {
+ exampleEnclosedHandlerRegion = i;
+ containsEH = true;
+ break;
+ }
+ }
+
+ if (containsEH)
+ {
+ JITDUMP("Finally for EH#%u encloses handler EH#%u; skipping.\n", XTnum, exampleEnclosedHandlerRegion);
+ continue;
+ }
+
+ // Look at blocks involved.
+ BasicBlock* const firstBlock = HBtab->ebdHndBeg;
+ BasicBlock* const lastBlock = HBtab->ebdHndLast;
+ assert(firstBlock != nullptr);
+ assert(lastBlock != nullptr);
+ BasicBlock* nextBlock = lastBlock->bbNext;
+ unsigned regionBBCount = 0;
+ unsigned regionStmtCount = 0;
+ bool hasFinallyRet = false;
+ bool isAllRare = true;
+ bool hasSwitch = false;
+
+ for (const BasicBlock* block = firstBlock; block != nextBlock; block = block->bbNext)
+ {
+ if (block->bbJumpKind == BBJ_SWITCH)
+ {
+ hasSwitch = true;
+ break;
+ }
+
+ regionBBCount++;
+
+ // Should we compute statement cost here, or is it
+ // premature...? For now just count statements I guess.
+ for (GenTreeStmt* stmt = block->firstStmt(); stmt != nullptr; stmt = stmt->gtNextStmt)
+ {
+ regionStmtCount++;
+ }
+
+ hasFinallyRet = hasFinallyRet || (block->bbJumpKind == BBJ_EHFINALLYRET);
+ isAllRare = isAllRare && block->isRunRarely();
+ }
+
+ // Skip cloning if the finally has a switch.
+ if (hasSwitch)
+ {
+ JITDUMP("Finally in EH#%u has a switch; skipping.\n", XTnum);
+ continue;
+ }
+
+ // Skip cloning if the finally must throw.
+ if (!hasFinallyRet)
+ {
+ JITDUMP("Finally in EH#%u does not return; skipping.\n", XTnum);
+ continue;
+ }
+
+ // Skip cloning if the finally is rarely run code.
+ if (isAllRare)
+ {
+ JITDUMP("Finally in EH#%u is run rarely; skipping.\n", XTnum);
+ continue;
+ }
+
+ // Empirical studies from CoreCLR and CoreFX show that less
+ // that 1% of finally regions have more than 15
+ // statements. So, to avoid potentially excessive code growth,
+ // only clone finallys that have 15 or fewer statements.
+ const unsigned stmtCountLimit = 15;
+ if (regionStmtCount > stmtCountLimit)
+ {
+ JITDUMP("Finally in EH#%u has %u statements, limit is %u; skipping.\n", XTnum, regionStmtCount,
+ stmtCountLimit);
+ continue;
+ }
+
+ JITDUMP("EH#%u is a candidate for finally cloning:"
+ " %u blocks, %u statements\n",
+ XTnum, regionBBCount, regionStmtCount);
+
+ // Walk the try region backwards looking for the last block
+ // that transfers control to a callfinally.
+ BasicBlock* const firstTryBlock = HBtab->ebdTryBeg;
+ BasicBlock* const lastTryBlock = HBtab->ebdTryLast;
+ assert(firstTryBlock->getTryIndex() == XTnum);
+ assert(lastTryBlock->getTryIndex() == XTnum);
+ BasicBlock* const beforeTryBlock = firstTryBlock->bbPrev;
+
+ BasicBlock* normalCallFinallyBlock = nullptr;
+ BasicBlock* normalCallFinallyReturn = nullptr;
+ BasicBlock* cloneInsertAfter = HBtab->ebdTryLast;
+ bool tryToRelocateCallFinally = false;
+
+ for (BasicBlock* block = lastTryBlock; block != beforeTryBlock; block = block->bbPrev)
+ {
+#if FEATURE_EH_CALLFINALLY_THUNKS
+ // Look for blocks that are always jumps to a call finally
+ // pair that targets our finally.
+ if (block->bbJumpKind != BBJ_ALWAYS)
+ {
+ continue;
+ }
+
+ BasicBlock* const jumpDest = block->bbJumpDest;
+
+ if (!jumpDest->isBBCallAlwaysPair() || (jumpDest->bbJumpDest != firstBlock))
+ {
+ continue;
+ }
+#else
+ // Look for call finally pair directly within the try
+ if (!block->isBBCallAlwaysPair() || (block->bbJumpDest != firstBlock))
+ {
+ continue;
+ }
+
+ BasicBlock* const jumpDest = block;
+#endif // FEATURE_EH_CALLFINALLY_THUNKS
+
+ // Found our block.
+ BasicBlock* const finallyReturnBlock = jumpDest->bbNext;
+ BasicBlock* const postTryFinallyBlock = finallyReturnBlock->bbJumpDest;
+
+ normalCallFinallyBlock = jumpDest;
+ normalCallFinallyReturn = postTryFinallyBlock;
+
+#if FEATURE_EH_CALLFINALLY_THUNKS
+ // When there are callfinally thunks, we don't expect to see the
+ // callfinally within a handler region either.
+ assert(!jumpDest->hasHndIndex());
+
+ // Update the clone insertion point to just after the
+ // call always pair.
+ cloneInsertAfter = finallyReturnBlock;
+
+ // We will consider moving the callfinally so we can fall
+ // through from the try into the clone.
+ tryToRelocateCallFinally = true;
+
+ JITDUMP("Chose path to clone: try block BB%02u jumps to callfinally at BB%02u;"
+ " the call returns to BB%02u which jumps to BB%02u\n",
+ block->bbNum, jumpDest->bbNum, finallyReturnBlock->bbNum, postTryFinallyBlock->bbNum);
+#else
+ JITDUMP("Chose path to clone: try block BB%02u is a callfinally;"
+ " the call returns to BB%02u which jumps to BB%02u\n",
+ block->bbNum, finallyReturnBlock->bbNum, postTryFinallyBlock->bbNum);
+#endif // FEATURE_EH_CALLFINALLY_THUNKS
+
+ break;
+ }
+
+ // If there is no call to the finally, don't clone.
+ if (normalCallFinallyBlock == nullptr)
+ {
+ JITDUMP("EH#%u: no calls from the try to the finally, skipping.\n", XTnum);
+ continue;
+ }
+
+ JITDUMP("Will update callfinally block BB%02u to jump to the clone;"
+ " clone will jump to BB%02u\n",
+ normalCallFinallyBlock->bbNum, normalCallFinallyReturn->bbNum);
+
+ // If there are multiple callfinallys and we're in the
+ // callfinally thunk model, all the callfinallys are placed
+ // just outside the try region. We'd like our chosen
+ // callfinally to come first after the try, so we can fall out of the try
+ // into the clone.
+ BasicBlock* firstCallFinallyRangeBlock = nullptr;
+ BasicBlock* endCallFinallyRangeBlock = nullptr;
+ ehGetCallFinallyBlockRange(XTnum, &firstCallFinallyRangeBlock, &endCallFinallyRangeBlock);
+
+ if (tryToRelocateCallFinally)
+ {
+ BasicBlock* firstCallFinallyBlock = nullptr;
+
+ for (BasicBlock* block = firstCallFinallyRangeBlock; block != endCallFinallyRangeBlock;
+ block = block->bbNext)
+ {
+ if (block->isBBCallAlwaysPair())
+ {
+ if (block->bbJumpDest == firstBlock)
+ {
+ firstCallFinallyBlock = block;
+ break;
+ }
+ }
+ }
+
+ // We better have found at least one call finally.
+ assert(firstCallFinallyBlock != nullptr);
+
+ // If there is more than one callfinally, move the one we are
+ // going to retarget to be first in the callfinally range.
+ if (firstCallFinallyBlock != normalCallFinallyBlock)
+ {
+ JITDUMP("Moving callfinally BB%02u to be first in line, before BB%02u\n", normalCallFinallyBlock->bbNum,
+ firstCallFinallyBlock->bbNum);
+
+ BasicBlock* const firstToMove = normalCallFinallyBlock;
+ BasicBlock* const lastToMove = normalCallFinallyBlock->bbNext;
+ BasicBlock* const placeToMoveAfter = firstCallFinallyBlock->bbPrev;
+
+ fgUnlinkRange(firstToMove, lastToMove);
+ fgMoveBlocksAfter(firstToMove, lastToMove, placeToMoveAfter);
+
+#ifdef DEBUG
+ // Sanity checks
+ fgDebugCheckBBlist(false, false);
+ fgVerifyHandlerTab();
+#endif // DEBUG
+
+ assert(nextBlock == lastBlock->bbNext);
+
+ // Update where the callfinally range begins, since we might
+ // have altered this with callfinally rearrangement, and/or
+ // the range begin might have been pretty loose to begin with.
+ firstCallFinallyRangeBlock = normalCallFinallyBlock;
+ }
+ }
+
+ // Clone the finally and retarget the normal return path and
+ // any other path that happens to share that same return
+ // point. For instance a construct like:
+ //
+ // try { } catch { } finally { }
+ //
+ // will have two call finally blocks, one for the normal exit
+ // from the try, and the other for the exit from the
+ // catch. They'll both return to the same point, which is the
+ // statement after the finally, so they can share the clone.
+ //
+ // Clone the finally body, and splice it into the flow graph
+ // within the parent region of the try.
+ const unsigned finallyTryIndex = firstBlock->bbTryIndex;
+ BasicBlock* insertAfter = nullptr;
+ BlockToBlockMap blockMap(getAllocator());
+ bool clonedOk = true;
+ unsigned cloneBBCount = 0;
+
+ for (BasicBlock* block = firstBlock; block != nextBlock; block = block->bbNext)
+ {
+ BasicBlock* newBlock;
+
+ if (block == firstBlock)
+ {
+ // Put the first cloned finally block into the appropriate
+ // region, somewhere within or after the range of
+ // callfinallys, depending on the EH implementation.
+ const unsigned hndIndex = 0;
+ BasicBlock* const nearBlk = cloneInsertAfter;
+ newBlock = fgNewBBinRegion(block->bbJumpKind, finallyTryIndex, hndIndex, nearBlk);
+
+ // If the clone ends up just after the finally, adjust
+ // the stopping point for finally traversal.
+ if (newBlock->bbNext == nextBlock)
+ {
+ assert(newBlock->bbPrev == lastBlock);
+ nextBlock = newBlock;
+ }
+ }
+ else
+ {
+ // Put subsequent blocks in the same region...
+ const bool extendRegion = true;
+ newBlock = fgNewBBafter(block->bbJumpKind, insertAfter, extendRegion);
+ }
+
+ cloneBBCount++;
+ assert(cloneBBCount <= regionBBCount);
+
+ insertAfter = newBlock;
+ blockMap.Set(block, newBlock);
+
+ clonedOk = BasicBlock::CloneBlockState(this, newBlock, block);
+
+ if (!clonedOk)
+ {
+ break;
+ }
+
+ // Update block flags. Note a block can be both first and last.
+ if (block == firstBlock)
+ {
+ // Mark the block as the start of the cloned finally.
+ newBlock->bbFlags |= BBF_CLONED_FINALLY_BEGIN;
+ }
+
+ if (block == lastBlock)
+ {
+ // Mark the block as the end of the cloned finally.
+ newBlock->bbFlags |= BBF_CLONED_FINALLY_END;
+ }
+
+ // Make sure clone block state hasn't munged the try region.
+ assert(newBlock->bbTryIndex == finallyTryIndex);
+
+ // Cloned handler block is no longer within the handler.
+ newBlock->clearHndIndex();
+
+ // Jump dests are set in a post-pass; make sure CloneBlockState hasn't tried to set them.
+ assert(newBlock->bbJumpDest == nullptr);
+ }
+
+ if (!clonedOk)
+ {
+ // TODO: cleanup the partial clone?
+ JITDUMP("Unable to clone the finally; skipping.\n");
+ continue;
+ }
+
+ // We should have cloned all the finally region blocks.
+ assert(cloneBBCount == regionBBCount);
+
+ JITDUMP("Cloned finally blocks are: BB%2u ... BB%2u\n", blockMap[firstBlock]->bbNum,
+ blockMap[lastBlock]->bbNum);
+
+ // Redirect any branches within the newly-cloned
+ // finally, and any finally returns to jump to the return
+ // point.
+ for (BasicBlock* block = firstBlock; block != nextBlock; block = block->bbNext)
+ {
+ BasicBlock* newBlock = blockMap[block];
+
+ if (block->bbJumpKind == BBJ_EHFINALLYRET)
+ {
+ GenTreeStmt* finallyRet = newBlock->lastStmt();
+ GenTreePtr finallyRetExpr = finallyRet->gtStmtExpr;
+ assert(finallyRetExpr->gtOper == GT_RETFILT);
+ fgRemoveStmt(newBlock, finallyRet);
+ newBlock->bbJumpKind = BBJ_ALWAYS;
+ newBlock->bbJumpDest = normalCallFinallyReturn;
+
+ fgAddRefPred(normalCallFinallyReturn, newBlock);
+ }
+ else
+ {
+ optCopyBlkDest(block, newBlock);
+ optRedirectBlock(newBlock, &blockMap);
+ }
+ }
+
+ // Modify the targeting call finallys to branch to the cloned
+ // finally. Make a note if we see some calls that can't be
+ // retargeted (since they want to return to other places).
+ BasicBlock* const firstCloneBlock = blockMap[firstBlock];
+ bool retargetedAllCalls = true;
+ BasicBlock* currentBlock = firstCallFinallyRangeBlock;
+
+ while (currentBlock != endCallFinallyRangeBlock)
+ {
+ BasicBlock* nextBlockToScan = currentBlock->bbNext;
+
+ if (currentBlock->isBBCallAlwaysPair())
+ {
+ if (currentBlock->bbJumpDest == firstBlock)
+ {
+ BasicBlock* const leaveBlock = currentBlock->bbNext;
+ BasicBlock* const postTryFinallyBlock = leaveBlock->bbJumpDest;
+
+ // Note we must retarget all callfinallies that have this
+ // continuation, or we can't clean up the continuation
+ // block properly below, since it will be reachable both
+ // by the cloned finally and by the called finally.
+ if (postTryFinallyBlock == normalCallFinallyReturn)
+ {
+ // This call returns to the expected spot, so
+ // retarget it to branch to the clone.
+ currentBlock->bbJumpDest = firstCloneBlock;
+ currentBlock->bbJumpKind = BBJ_ALWAYS;
+
+ // Ref count updates.
+ fgAddRefPred(firstCloneBlock, currentBlock);
+ // fgRemoveRefPred(firstBlock, currentBlock);
+
+ // Delete the leave block, which should be marked as
+ // keep always.
+ assert((leaveBlock->bbFlags & BBF_KEEP_BBJ_ALWAYS) != 0);
+ nextBlock = leaveBlock->bbNext;
+
+ leaveBlock->bbFlags &= ~BBF_KEEP_BBJ_ALWAYS;
+ fgRemoveBlock(leaveBlock, true);
+
+ // Make sure iteration isn't going off the deep end.
+ assert(leaveBlock != endCallFinallyRangeBlock);
+ }
+ else
+ {
+ // We can't retarget this call since it
+ // returns somewhere else.
+ retargetedAllCalls = false;
+ }
+ }
+ }
+
+ currentBlock = nextBlockToScan;
+ }
+
+ // If we retargeted all calls, modify EH descriptor to be
+ // try-fault instead of try-finally, and change the non-cloned
+ // finally's catch type to fault.
+ if (retargetedAllCalls)
+ {
+ JITDUMP("All callfinallys retargeted; changing finally to fault.\n");
+ HBtab->ebdHandlerType = EH_HANDLER_FAULT_WAS_FINALLY;
+ firstBlock->bbCatchTyp = BBCT_FAULT;
+ }
+ else
+ {
+ JITDUMP("Some callfinallys *not* retargeted, so region must remain as a finally.\n");
+ }
+
+ // Modify first block of cloned finally to be a "normal" block.
+ BasicBlock* firstClonedBlock = blockMap[firstBlock];
+ firstClonedBlock->bbCatchTyp = BBCT_NONE;
+
+ // The normalCallFinallyReturn may be a finalStep block. It
+ // is now a normal block, since all the callfinallies that
+ // return to it are now going via the clone, so clear the
+ // special keep always flag.
+ normalCallFinallyReturn->bbFlags &= ~BBF_KEEP_BBJ_ALWAYS;
+
+#if FEATURE_EH_FUNCLETS && defined(_TARGET_ARM_)
+ // Also, clear the finally target bit for arm
+ fgClearFinallyTargetBit(normalCallFinallyReturn);
+#endif // FEATURE_EH_FUNCLETS && defined(_TARGET_ARM_)
+
+#if !FEATURE_EH_FUNCLETS
+ // Remove the GT_END_LFIN from the normalCallFinallyReturn
+ // since no callfinally returns there anymore.
+ GenTreeStmt* endFinallyStmt = normalCallFinallyReturn->lastStmt();
+ GenTreePtr endFinallyExpr = endFinallyStmt->gtStmtExpr;
+ assert(endFinallyExpr->gtOper == GT_END_LFIN);
+ fgRemoveStmt(normalCallFinallyReturn, endFinallyStmt);
+#endif
+
+ // Todo -- mark cloned blocks as a cloned finally....
+
+ // Done!
+ JITDUMP("\nDone with EH#%u\n\n", XTnum);
+ cloneCount++;
+ }
+
+ if (cloneCount > 0)
+ {
+ JITDUMP("fgCloneFinally() cloned %u finally handlers\n", cloneCount);
+
+#ifdef DEBUG
+ if (verbose)
+ {
+ printf("\n*************** After fgCloneFinally()\n");
+ fgDispBasicBlocks();
+ fgDispHandlerTab();
+ printf("\n");
+ }
+
+ fgVerifyHandlerTab();
+ fgDebugCheckBBlist(false, false);
+ fgDebugCheckTryFinallyExits();
+
+#endif // DEBUG
+ }
+}
+
+#ifdef DEBUG
+
+//------------------------------------------------------------------------
+// fgDebugCheckTryFinallyExits: validate normal flow from try-finally
+// or try-fault-was-finally.
+//
+// Notes:
+//
+// Normal control flow exiting the try block of a try-finally must
+// pass through the finally. This checker attempts to verify that by
+// looking at the control flow graph.
+//
+// Each path that exits the try of a try-finally (including try-finallys
+// that fgCloneFinally converted into try-faults) should
+// thus either execute a callfinally to the associated finally or else
+// jump to a block with the BBF_CLONED_FINALLY_BEGIN flag set.
+//
+// Depending on when this check is done, there may also be an empty
+// block along the path.
+//
+// Depending on the model for invoking finallys, the callfinallies may
+// lie within the try region (callfinally thunks) or in the enclosing
+// region.
+
+void Compiler::fgDebugCheckTryFinallyExits()
+{
+ unsigned XTnum = 0;
+ EHblkDsc* HBtab = compHndBBtab;
+ unsigned cloneCount = 0;
+ bool allTryExitsValid = true;
+ for (; XTnum < compHndBBtabCount; XTnum++, HBtab++)
+ {
+ const EHHandlerType handlerType = HBtab->ebdHandlerType;
+ const bool isFinally = (handlerType == EH_HANDLER_FINALLY);
+ const bool wasFinally = (handlerType == EH_HANDLER_FAULT_WAS_FINALLY);
+
+ // Screen out regions that are or were not finallys.
+ if (!isFinally && !wasFinally)
+ {
+ continue;
+ }
+
+ // Walk blocks of the try, looking for normal control flow to
+ // an ancestor region.
+
+ BasicBlock* const firstTryBlock = HBtab->ebdTryBeg;
+ BasicBlock* const lastTryBlock = HBtab->ebdTryLast;
+ assert(firstTryBlock->getTryIndex() <= XTnum);
+ assert(lastTryBlock->getTryIndex() <= XTnum);
+ BasicBlock* const afterTryBlock = lastTryBlock->bbNext;
+ BasicBlock* const finallyBlock = isFinally ? HBtab->ebdHndBeg : nullptr;
+
+ for (BasicBlock* block = firstTryBlock; block != afterTryBlock; block = block->bbNext)
+ {
+ // Only check the directly contained blocks.
+ assert(block->hasTryIndex());
+
+ if (block->getTryIndex() != XTnum)
+ {
+ continue;
+ }
+
+ // Look at each of the normal control flow possibilities.
+ const unsigned numSuccs = block->NumSucc();
+
+ for (unsigned i = 0; i < numSuccs; i++)
+ {
+ BasicBlock* const succBlock = block->GetSucc(i);
+
+ if (succBlock->hasTryIndex() && succBlock->getTryIndex() <= XTnum)
+ {
+ // Successor does not exit this try region.
+ continue;
+ }
+
+#if FEATURE_EH_CALLFINALLY_THUNKS
+
+ // When there are callfinally thunks, callfinallies
+ // logically "belong" to a child region and the exit
+ // path validity will be checked when looking at the
+ // try blocks in that region.
+ if (block->bbJumpKind == BBJ_CALLFINALLY)
+ {
+ continue;
+ }
+
+#endif // FEATURE_EH_CALLFINALLY_THUNKS
+
+ // Now we know block lies directly within the try of a
+ // try-finally, and succBlock is in an enclosing
+ // region (possibly the method region). So this path
+ // represents flow out of the try and should be
+ // checked.
+ //
+ // There are various ways control can properly leave a
+ // try-finally (or try-fault-was-finally):
+ //
+ // (a1) via a jump to a callfinally (only for finallys, only for call finally thunks)
+ // (a2) via a callfinally (only for finallys, only for !call finally thunks)
+ // (b) via a jump to a begin finally clone block
+ // (c) via a jump to an empty block to (b)
+ // (d) via a fallthrough to an empty block to (b)
+ // (e) via the always half of a callfinally pair
+ // (f) via an always jump clonefinally exit
+ bool isCallToFinally = false;
+
+#if FEATURE_EH_CALLFINALLY_THUNKS
+ if (succBlock->bbJumpKind == BBJ_CALLFINALLY)
+ {
+ // case (a1)
+ isCallToFinally = isFinally && (succBlock->bbJumpDest == finallyBlock);
+ }
+#else
+ if (block->bbJumpKind == BBJ_CALLFINALLY)
+ {
+ // case (a2)
+ isCallToFinally = isFinally && (block->bbJumpDest == finallyBlock);
+ }
+#endif // FEATURE_EH_CALLFINALLY_THUNKS
+
+ bool isJumpToClonedFinally = false;
+
+ if (succBlock->bbFlags & BBF_CLONED_FINALLY_BEGIN)
+ {
+ // case (b)
+ isJumpToClonedFinally = true;
+ }
+ else if (succBlock->bbJumpKind == BBJ_ALWAYS)
+ {
+ if (succBlock->isEmpty())
+ {
+ // case (c)
+ BasicBlock* const succSuccBlock = succBlock->bbJumpDest;
+
+ if (succSuccBlock->bbFlags & BBF_CLONED_FINALLY_BEGIN)
+ {
+ isJumpToClonedFinally = true;
+ }
+ }
+ }
+ else if (succBlock->bbJumpKind == BBJ_NONE)
+ {
+ if (succBlock->isEmpty())
+ {
+ BasicBlock* const succSuccBlock = succBlock->bbNext;
+
+ // case (d)
+ if (succSuccBlock->bbFlags & BBF_CLONED_FINALLY_BEGIN)
+ {
+ isJumpToClonedFinally = true;
+ }
+ }
+ }
+
+ bool isReturnFromFinally = false;
+
+ // Case (e). Ideally we'd have something stronger to
+ // check here -- eg that we are returning from a call
+ // to the right finally -- but there are odd cases
+ // like orphaned second halves of callfinally pairs
+ // that we need to tolerate.
+ if (block->bbFlags & BBF_KEEP_BBJ_ALWAYS)
+ {
+ isReturnFromFinally = true;
+ }
+
+ // Case (f)
+ if (block->bbFlags & BBF_CLONED_FINALLY_END)
+ {
+ isReturnFromFinally = true;
+ }
+
+ const bool thisExitValid = isCallToFinally || isJumpToClonedFinally || isReturnFromFinally;
+
+ if (!thisExitValid)
+ {
+ JITDUMP("fgCheckTryFinallyExitS: EH#%u exit via BB%02u -> BB%02u is invalid\n", XTnum, block->bbNum,
+ succBlock->bbNum);
+ }
+
+ allTryExitsValid = allTryExitsValid & thisExitValid;
+ }
+ }
+ }
+
+ if (!allTryExitsValid)
+ {
+ JITDUMP("fgCheckTryFinallyExits: method contains invalid try exit paths\n");
+ assert(allTryExitsValid);
+ }
+}
+
+#endif // DEBUG
diff --git a/src/jit/jitconfigvalues.h b/src/jit/jitconfigvalues.h
index 39a2505246..8a25af05eb 100644
--- a/src/jit/jitconfigvalues.h
+++ b/src/jit/jitconfigvalues.h
@@ -274,6 +274,14 @@ CONFIG_INTEGER(JitInlinePolicyModel, W("JitInlinePolicyModel"), 0)
CONFIG_INTEGER(JitEECallTimingInfo, W("JitEECallTimingInfo"), 0)
+#if defined(DEBUG)
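+// DEBUG-only override consulted by fgCloneFinally: finally cloning runs only when this value is 1.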
+#if defined(FEATURE_CORECLR)
+CONFIG_INTEGER(JitEnableFinallyCloning, W("JitEnableFinallyCloning"), 1)
+#else
+CONFIG_INTEGER(JitEnableFinallyCloning, W("JitEnableFinallyCloning"), 0)
+#endif // defined(FEATURE_CORECLR)
+#endif // DEBUG
+
#undef CONFIG_INTEGER
#undef CONFIG_STRING
#undef CONFIG_METHODSET
diff --git a/src/jit/jiteh.cpp b/src/jit/jiteh.cpp
index 01a2fdce83..2d0eee366f 100644
--- a/src/jit/jiteh.cpp
+++ b/src/jit/jiteh.cpp
@@ -93,7 +93,7 @@ bool EHblkDsc::HasFinallyHandler()
bool EHblkDsc::HasFaultHandler()
{
- return ebdHandlerType == EH_HANDLER_FAULT;
+ return (ebdHandlerType == EH_HANDLER_FAULT) || (ebdHandlerType == EH_HANDLER_FAULT_WAS_FINALLY);
}
bool EHblkDsc::HasFinallyOrFaultHandler()
diff --git a/src/jit/jiteh.h b/src/jit/jiteh.h
index 573116282c..502d2153c2 100644
--- a/src/jit/jiteh.h
+++ b/src/jit/jiteh.h
@@ -27,7 +27,8 @@ enum EHHandlerType
EH_HANDLER_CATCH = 0x1, // Don't use zero (to aid debugging uninitialized memory)
EH_HANDLER_FILTER,
EH_HANDLER_FAULT,
- EH_HANDLER_FINALLY
+ EH_HANDLER_FINALLY,
+ EH_HANDLER_FAULT_WAS_FINALLY
};
// ToCORINFO_EH_CLAUSE_FLAGS: Convert an internal EHHandlerType to a CORINFO_EH_CLAUSE_FLAGS value
@@ -41,6 +42,7 @@ inline CORINFO_EH_CLAUSE_FLAGS ToCORINFO_EH_CLAUSE_FLAGS(EHHandlerType type)
case EH_HANDLER_FILTER:
return CORINFO_EH_CLAUSE_FILTER;
case EH_HANDLER_FAULT:
+ case EH_HANDLER_FAULT_WAS_FINALLY:
return CORINFO_EH_CLAUSE_FAULT;
case EH_HANDLER_FINALLY:
return CORINFO_EH_CLAUSE_FINALLY;
diff --git a/src/jit/morph.cpp b/src/jit/morph.cpp
index d2a6843b68..24872dee45 100644
--- a/src/jit/morph.cpp
+++ b/src/jit/morph.cpp
@@ -16910,6 +16910,14 @@ void Compiler::fgMorph()
fgDebugCheckBBlist(false, false);
#endif // DEBUG
+ fgRemoveEmptyFinally();
+
+ EndPhase(PHASE_EMPTY_FINALLY);
+
+ fgCloneFinally();
+
+ EndPhase(PHASE_CLONE_FINALLY);
+
/* For x64 and ARM64 we need to mark irregular parameters early so that they don't get promoted */
fgMarkImplicitByRefArgs();