summaryrefslogtreecommitdiff
path: root/shaders/h264/mc/Intra_8x8.asm
blob: 05a0be59d538b24570fc0b4473190292d4dcf9fb (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
/*
 * Decode Intra_8x8 macroblock
 * Copyright © <2010>, Intel Corporation.
 *
 * This program is licensed under the terms and conditions of the
 * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
 * http://www.opensource.org/licenses/eclipse-1.0.php.
 *
 */
// Kernel name: Intra_8x8.asm
//
// Decoding of Intra_8x8 macroblock
//
//  $Revision: 12 $
//  $Date: 10/18/06 4:10p $
//

// ----------------------------------------------------
//  Main: Intra_8x8
// ----------------------------------------------------

#define	INTRA_8X8

.kernel Intra_8x8
INTRA_8x8:

#ifdef _DEBUG
// WA for FULSIM so we'll know which kernel is being debugged
mov (1) acc0:ud 0x01aa55a5:ud
#endif

#include "SetupForHWMC.asm"

#define		REG_INTRA_PRED_AVAIL	REG_INTRA_TEMP_4
#define		INTRA_PRED_AVAIL		REG_INTRA_TEMP_4.4

// Offset where 8x8 predicted data blocks are stored
#define	PREDBLK0	0*GRFWIB
#define	PREDBLK1	4*GRFWIB
#define	PREDBLK2	8*GRFWIB
#define	PREDBLK3	12*GRFWIB

#ifdef SW_SCOREBOARD

// Update "E" flag with "F" flag information
	mov (1)	REG_INTRA_TEMP_0<1>:w	REG_INTRA_PRED_AVAIL_FLAG_WORD<0;1,0>:w		// Store original Intra_Pred_Avail_Flag
	and.nz.f0.0 (1)	NULLREG	REG_MBAFF_PIC	MBAFF_PIC	// Is current MBAFF picture
	and.z.f0.1 (1) NULLREG	REG_INTRA_PRED_AVAIL_FLAG	INTRA_PRED_LEFT_TH_AVAIL_FLAG	// Is "A" not available?
	(f0.0) and.z.f0.0 (1) NULLREG	REG_FIELD_MACROBLOCK_FLAG	FIELD_MACROBLOCK_FLAG	// Is current frame MB?
	(f0.1) and.nz.f0.1 (1) NULLREG	REG_INTRA_PRED_8X8_BLK2_AVAIL_FLAG	INTRA_PRED_8X8_BLK2_AVAIL_FLAG	// Is "F" flag set?
	(f0.0.allv) or (1)	REG_INTRA_PRED_AVAIL_FLAG_WORD<1>:w	REG_INTRA_PRED_AVAIL_FLAG_WORD<0;1,0>:w	INTRA_PRED_LEFT_BH_AVAIL_FLAG	// Set "E" to 1 if all conditions meet

    CALL(scoreboard_start_intra,1)
	mov (1)	REG_INTRA_PRED_AVAIL_FLAG_WORD<1>:w	REG_INTRA_TEMP_0<0;1,0>:w		// Restore original Intra_Pred_Avail_Flag
#endif

#ifdef SW_SCOREBOARD    
	wait	n0:ud		//	Now wait for scoreboard to response
#endif

//
//  Decode Y blocks
//
//	Load reference data from neighboring macroblocks
    CALL(load_Intra_Ref_Y,1)

	mov	(1)	PERROR<1>:w	ERRBUF*GRFWIB:w			// Pointer to macroblock error data
	mov	(1)	PDECBUF_UD<1>:ud	0x00010001*PREDBUF*GRFWIB+0x00100000:ud	// Pointers to predicted data
	shr (2)	REG_INTRA_PRED_AVAIL<1>:w	REG_INTRA_PRED_AVAIL_FLAG_BYTE<0;1,0>:ub	0x40:v
	
#if 1
	mov (4) REF_LEFT_D(0,0)<1>	0:ud		// This is to make validation easier. Without it, DRAM mismatch occurs.
#endif

//	Intra predict Intra_8x8 luma blocks
//
//	Sub-macroblock 0 *****************
	mov	(16)	REF_TOP_W(0)<1>	REG_INTRA_REF_TOP<16;16,1>:w		// Copy entire top reference data
	and.nz.f0.0 (8)	NULLREG		REG_INTRA_PRED_AVAIL_FLAG	INTRA_PRED_UP_LEFT_AVAIL_FLAG	// Is "D" available?
	(-f0.0) mov (1)	REF_TOP(0,-1)<1>	INTRA_REF_TOP(0)REGION(1,0)		// p[-1,-1] = p[0,-1]

	mov	(8)		REF_LEFT(0,2)<1>	INTRA_REF_LEFT(0)	// Left reference data, (leave 2 for reference filtering)
	(-f0.0) mov (1)		REF_LEFT(0,1)<1>	INTRA_REF_LEFT(0)REGION(1,0)			// p[-1,-1]=p[-1,0]
	(f0.0.any2h)  mov (2)		REF_LEFT(0,0)<1>	INTRA_REF_TOP(0,-1)REGION(1,0)	// p'[-1,y] (y=0,1) = p[-1,-1]
	and.nz.f0.1 (1)	NULLREG		REG_INTRA_PRED_AVAIL_FLAG	INTRA_PRED_UP_AVAIL_FLAG	// Is "B" available?
	(f0.1) mov	(1)		REF_LEFT(0,0)<1>	INTRA_REF_TOP(0,0)REGION(1,0)	// p[0,-1] for left filtering
	and.nz.f0.1 (1)	NULLREG		REG_INTRA_PRED_AVAIL_FLAG	INTRA_PRED_LEFT_TH_AVAIL_FLAG	// Is "A" available?
	(-f0.1) mov	(1)		REF_LEFT(0,2)<1>	INTRA_REF_TOP(0,-1)REGION(1,0)	// p'[-1,2] = p[-1,-1]

	and	(1)		PRED_MODE<1>:w	INTRA_PRED_MODE(0)REGION(1,0)	0x0F:w		// Intra pred mode for current block
	mov (1)		INTRA_PRED_AVAIL<1>:w	REG_INTRA_PRED_AVAIL<0;1,0>:w		// Top/Left neighbor available flags
	CALL(intra_Pred_8x8_Y,1)
    add (1)		PERROR<1>:w	PERROR<0;1,0>:w	0x0080:w	// Pointers to next error block

//	Sub-macroblock 1 *****************
	mov	(16)	REF_TOP0(0)<1>	INTRA_REF_TOP(0,4)	// Top reference data
	and.nz.f0.1 (8)	NULLREG		REG_INTRA_PRED_AVAIL_FLAG	INTRA_PRED_UP_RIGHT_AVAIL_FLAG	// Is "C" available?
	(f0.1.any8h)  mov (8)	REF_TOP(0,8)<1>	INTRA_REF_TOP(0,16)<8;8,1>		// Take data from "C"
	(-f0.1.any8h) mov (8)	REF_TOP(0,8)<1>	INTRA_REF_TOP(0,15)REGION(1,0)	// Repeat last pixel from "B"

	mov	(4)		REF_LEFT(0,2)<1>	DEC_Y(0,14)<16;1,0>		// Left reference data (top half) (leave 2 for reference filtering)
	mov	(4)		REF_LEFT(0,6)<1>	DEC_Y(2,14)<16;1,0>		// Left reference data (bottom half)
	mov	(2)		REF_LEFT(0,0)<1>	INTRA_REF_TOP(0,7)REGION(1,0)		// p'[-1,y] (y=0,1) = p[-1,-1]
	and.nz.f0.1 (1)	NULLREG		REG_INTRA_PRED_AVAIL_FLAG	INTRA_PRED_UP_AVAIL_FLAG	// Is "B" available?
	(f0.1)  mov	(1)		REF_LEFT(0,0)<1>	INTRA_REF_TOP(0,8)REGION(1,0)	// p[-1,-1] for left filtering
	(-f0.1) mov	(1)		REF_LEFT(0,1)<1>	DEC_Y(0,14)REGION(1,0)	// p[-1,-1] = p[-1,0]

	shr	(1)		PRED_MODE<1>:w	INTRA_PRED_MODE(0)REGION(1,0)	4:w		// Intra pred mode for current block
	add	(2)		PPREDBUF_Y<1>:w	PPREDBUF_Y<2;2,1>:w	4*GRFWIB:w			// Pointer to predicted sub-macroblock 1
	or (1)		INTRA_PRED_AVAIL<1>:w	REG_INTRA_PRED_AVAIL<0;1,0>:w	1:w		// Left neighbor is available
	CALL(intra_Pred_8x8_Y,1)
    add (1)		PERROR<1>:w	PERROR<0;1,0>:w	0x0080:w	// Pointers to next error block

//	Pack constructed data from word-aligned to byte-aligned format and interlace Y0 and Y1(every two Y rows)
//	to speed up save_8x8_Y module later
//	PPREDBUF_Y now points to sub-macroblock Y1
	mov (32)	r[PPREDBUF_Y,-PREDBLK1]<1>:ub		DEC_Y(0)<32;16,2> {Compr}	// First 4 Y0 rows
	mov (32)	r[PPREDBUF_Y,0-PREDBLK1+32]<1>:ub	DEC_Y(4)<32;16,2> {Compr}	// First 4 Y1 rows
	mov (32)	r[PPREDBUF_Y,0-PREDBLK1+64]<1>:ub	DEC_Y(2)<32;16,2> {Compr}	// Second 4 Y0 rows
	mov (32)	r[PPREDBUF_Y,0-PREDBLK1+96]<1>:ub	DEC_Y(6)<32;16,2> {Compr}	// Second 4 Y1 rows

//	Sub-macroblock 2 *****************
//	Intra_8x8 special available flag handling
	and.nz.f0.0 (1)	NULLREG	REG_MBAFF_PIC	MBAFF_PIC	// Is current MBAFF picture
	and.z.f0.1 (1) NULLREG	REG_INTRA_PRED_AVAIL_FLAG	INTRA_PRED_LEFT_TH_AVAIL_FLAG	// Is "A" not available?
	(f0.0) and.z.f0.0 (1) NULLREG	REG_FIELD_MACROBLOCK_FLAG	FIELD_MACROBLOCK_FLAG	// Is current frame MB?
	(f0.1) and.nz.f0.1 (1) NULLREG	REG_INTRA_PRED_8X8_BLK2_AVAIL_FLAG	INTRA_PRED_8X8_BLK2_AVAIL_FLAG	// Is special intra_8x8 available flag set?
	(f0.0.allv) mov (1)	REF_TOP(0,-1)<1>	INTRA_REF_LEFT0(0,31)REGION(1,0)	// Top-left reference data
	(f0.0.allv) jmpi (1)	INTRA_8x8_BLK2
//	Done intra_8x8 special available flag handling

	and.nz.f0.0 (8)	NULLREG	REG_INTRA_PRED_AVAIL_FLAG	INTRA_PRED_LEFT_TH_AVAIL_FLAG	// Is top-half "A" available?
	(f0.0.any4h) mov (4)	REF_TOP0(0)<1>		INTRA_REF_LEFT0(0,28)REGION(4,1)	// Top-left reference data
	(-f0.0) mov (1)	REF_TOP(0,-1)<1>	DEC_Y(2,24)REGION(1,0)	// p[-1,-1] = p[0,-1]
INTRA_8x8_BLK2:
	mov	(8)		REF_TOP(0)<1>		DEC_Y(2,24)REGION(8,1)		// Top reference data
	mov	(8)		REF_TOP(0,8)<1>		DEC_Y(3,24)REGION(8,1)		// Top reference data

	mov	(8)		REF_LEFT(0,2)<1>	INTRA_REF_LEFT(1)			// Left reference data,  (leave 2 for reference filtering)
	mov (1)		REF_LEFT(0,0)<1>	DEC_Y(2,24)REGION(1,0)		// p'[-1,0] = p[0,-1] since "B" is always available
	(f0.0) mov	(1)	REF_LEFT(0,1)<1>	INTRA_REF_LEFT(0,28)REGION(1,0)	// p[-1,1] = p[-1,-1] if top-half "A" available
	(-f0.0) mov (1)	REF_LEFT(0,1)<1>	INTRA_REF_LEFT(1)REGION(1,0)	// p[-1,1] = p[-1,0]
	and.nz.f0.1 (1)	NULLREG	REG_INTRA_PRED_AVAIL_FLAG	INTRA_PRED_LEFT_BH_AVAIL_FLAG	// Is bottom-half "A" available?
	(-f0.1) mov	(1)	REF_LEFT(0,2)<1>	INTRA_REF_LEFT(0,28)REGION(1,0)	// p'[-1,2] = p[-1,-1]

	and	(1)		PRED_MODE<1>:w			INTRA_PRED_MODE(0,1)REGION(1,0)	0x0F:w	// Intra pred mode for current block
	or (1)		INTRA_PRED_AVAIL<1>:w	REG_INTRA_PRED_AVAIL.1<0;1,0>:w	2:w		// Top neighbor is available
	CALL(intra_Pred_8x8_Y,1)
    add (1)		PERROR<1>:w	PERROR<0;1,0>:w	0x0080:w	// Pointers to next error block

//	Sub-macroblock 3 *****************
	mov	(4)		REF_TOP0(0)<1>		DEC_Y(2,28)REGION(4,1)		// Top-left reference data
	mov	(8)		REF_TOP(0)<1>		DEC_Y(3,24)REGION(8,1)		// Top reference data
	mov	(16)	REF_TOP(0,8)<1>		DEC_Y(3,31)REGION(1,0)		// Top-right reference data

	mov (4)		REF_LEFT(0,2)<1>	DEC_Y(4,14)<16;1,0>		// Left reference data (top half) (leave 2 for reference filtering)
	mov (4)		REF_LEFT(0,6)<1>	DEC_Y(6,14)<16;1,0>		// Left reference data (bottom half)
	mov (1)		REF_LEFT(0,0)<1>	DEC_Y(3,24)REGION(1,0)	// p[-1,0] = p[0,-1]
	mov (1)		REF_LEFT(0,1)<1>	DEC_Y(2,31)REGION(1,0)	// p[-1,1] = p[-1,-1]

	shr	(1)		PRED_MODE<1>:w	INTRA_PRED_MODE(0,1)REGION(1,0)	4:w		// Intra pred mode for current block
	add	(2)		PPREDBUF_Y<1>:w	PPREDBUF_Y<2;2,1>:w	4*GRFWIB:w	// Pointer to predicted sub-macroblock 3
	or (1)		INTRA_PRED_AVAIL<1>:w	REG_INTRA_PRED_AVAIL<0;1,0>:w	3:w		// Top and Left neighbor are available
	CALL(intra_Pred_8x8_Y,1)

//	Pack constructed data from word-aligned to byte-aligned format
//	to speed up save_8x8_Y module later
//	PPREDBUF_Y now points to sub-macroblock Y1
	mov (32)	r[PPREDBUF_Y,-PREDBLK1]<1>:ub		DEC_Y(4)<32;16,2> {Compr}	// First 4 Y2 rows
	mov (32)	r[PPREDBUF_Y,0-PREDBLK1+32]<1>:ub	DEC_Y(8)<32;16,2> {Compr}	// First 4 Y3 rows
	mov (32)	r[PPREDBUF_Y,0-PREDBLK1+64]<1>:ub	DEC_Y(6)<32;16,2> {Compr}	// Second 4 Y2 rows
	mov (32)	r[PPREDBUF_Y,0-PREDBLK1+96]<1>:ub	DEC_Y(10)<32;16,2> {Compr}	// Second 4 Y3 rows

//	All 4 sub-macroblock (containing 4 intra_8x8 blocks) have be constructed
//	Save constructed Y picture
	CALL(save_8x8_Y,1)		// Save Intra_8x8 predicted luma data.
//
//  Decode U/V blocks
//
//	Note: The decoding for chroma blocks will be the same for all intra prediction mode
//
	CALL(decode_Chroma_Intra,1)

#ifdef SW_SCOREBOARD
    #include "scoreboard_update.asm"
#endif

// Terminate the thread
//
    #include "EndIntraThread.asm"

// End of Intra_8x8