Repo for the search and displace ingest module that takes odf, docx and pdf and transforms it into .md to be used with search and displace operations
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

793 lines
25 KiB

3 years ago
  1. unit imjidctasm;
  2. { This file contains a slow-but-accurate integer implementation of the
  3. inverse DCT (Discrete Cosine Transform). In the IJG code, this routine
  4. must also perform dequantization of the input coefficients.
  5. A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT
  6. on each row (or vice versa, but it's more convenient to emit a row at
  7. a time). Direct algorithms are also available, but they are much more
  8. complex and seem not to be any faster when reduced to code.
  9. This implementation is based on an algorithm described in
  10. C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
  11. Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
  12. Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
  13. The primary algorithm described there uses 11 multiplies and 29 adds.
  14. We use their alternate method with 12 multiplies and 32 adds.
  15. The advantage of this method is that no data path contains more than one
  16. multiplication; this allows a very simple and accurate implementation in
  17. scaled fixed-point arithmetic, with a minimal number of shifts. }
  18. { Original : jidctint.c ; Copyright (C) 1991-1996, Thomas G. Lane. }
  19. { ;-------------------------------------------------------------------------
  20. ; JIDCTINT.ASM
  21. ; 80386 protected mode assembly translation of JIDCTINT.C
  22. ; **** Optimized to all hell by Jason M. Felice (jasonf@apk.net) ****
  23. ; **** E-mail welcome ****
  24. ;
  25. ; ** This code does not make O/S calls -- use it for OS/2, Win95, WinNT,
  26. ; ** DOS prot. mode., Linux, whatever... have fun.
  27. ;
  28. ; ** Note, this code is dependent on the structure member order in the .h
  29. ; ** files for the following structures:
  30. ; -- amazingly NOT j_decompress_struct... cool.
  31. ; -- jpeg_component_info (dependent on position of dct_table element)
  32. ;
  33. ; Originally created with the /Fa option of MSVC 4.0 (why work when you
  34. ; don't have to?)
  35. ;
  36. ; (this code, when compiled is 1K bytes smaller than the optimized MSVC
  37. ; release build, not to mention 120-130 ms faster in my profile test with 1
  38. small color and 1 medium black-and-white jpeg: stats using TASM 4.0
  39. ; and MSVC 4.0 to create a non-console app; jpeg_idct_islow accumulated
  40. ; 5,760 hits on all trials)
  41. ;
  42. ; TASM -t -ml -os jidctint.asm, jidctint.obj
  43. ;-------------------------------------------------------------------------
  44. Converted to Delphi 2.0 BASM for PasJPEG
  45. by Jacques NOMSSI NZALI <nomssi@physik.tu-chemnitz.de>
  46. October 13th 1996
  47. * assumes Delphi "register" calling convention:
  48. the first 3 parameters are passed in EAX, EDX, ECX
  49. * register allocation revised
  50. }
  51. interface
  52. {$I imjconfig.inc}
  53. uses
  54. imjmorecfg,
  55. imjinclude,
  56. imjpeglib,
  57. imjdct; { Private declarations for DCT subsystem }
  58. { Perform dequantization and inverse DCT on one block of coefficients.
        cinfo      : decompressor state; the implementation reads only its
                     sample_range_limit field.
        compptr    : component info; the implementation reads only its
                     dct_table field (the per-coefficient multipliers).
        coef_block : the DCTSIZE x DCTSIZE block of input coefficients.
        output_buf : array of row pointers; rows 0..DCTSIZE-1 are written.
        output_col : offset added to each row pointer before the DCTSIZE
                     output samples of that row are stored. }
  59. {GLOBAL}
  60. procedure jpeg_idct_islow (cinfo : j_decompress_ptr;
  61. compptr : jpeg_component_info_ptr;
  62. coef_block : JCOEFPTR;
  63. output_buf : JSAMPARRAY;
  64. output_col : JDIMENSION);
  65. implementation
  66. { This module is specialized to the case DCTSIZE = 8. }
  67. {$ifndef DCTSIZE_IS_8}
  68. Sorry, this code only copes with 8x8 DCTs. { deliberate syntax err }
  69. {$endif}
  70. { The poop on this scaling stuff is as follows:
  71. Each 1-D IDCT step produces outputs which are a factor of sqrt(N)
  72. larger than the true IDCT outputs. The final outputs are therefore
  73. a factor of N larger than desired; since N=8 this can be cured by
  74. a simple right shift at the end of the algorithm. The advantage of
  75. this arrangement is that we save two multiplications per 1-D IDCT,
  76. because the y0 and y4 inputs need not be divided by sqrt(N).
  77. We have to do addition and subtraction of the integer inputs, which
  78. is no problem, and multiplication by fractional constants, which is
  79. a problem to do in integer arithmetic. We multiply all the constants
  80. by CONST_SCALE and convert them to integer constants (thus retaining
  81. CONST_BITS bits of precision in the constants). After doing a
  82. multiplication we have to divide the product by CONST_SCALE, with proper
  83. rounding, to produce the correct output. This division can be done
  84. cheaply as a right shift of CONST_BITS bits. We postpone shifting
  85. as long as possible so that partial sums can be added together with
  86. full fractional precision.
  87. The outputs of the first pass are scaled up by PASS1_BITS bits so that
  88. they are represented to better-than-integral precision. These outputs
  89. require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
  90. with the recommended scaling. (To scale up 12-bit sample data further, an
  91. intermediate INT32 array would be needed.)
  92. To avoid overflow of the 32-bit intermediate results in pass 2, we must
  93. have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26. Error analysis
  94. shows that the values given below are the most effective. }
  { Fixed-point scaling parameters and the pre-scaled constants used by
    jpeg_idct_islow. }
  const
    { Number of fractional bits carried by the multiplier constants. }
    CONST_BITS = 13;

    { Extra fractional bits kept in the pass 1 (column) results. }
  {$ifdef BITS_IN_JSAMPLE_IS_8}
    PASS1_BITS = 2;
  {$else}
    PASS1_BITS = 1;   { lose a little precision to avoid overflow }
  {$endif}

    CONST_SCALE = INT32(1) shl CONST_BITS;

    { FIX_a_b is the real number a.b multiplied by CONST_SCALE and rounded
      to an integer. The trailing figure is the value for CONST_BITS = 13. }
    FIX_0_298631336 = INT32(Round(0.298631336 * CONST_SCALE));   { 2446 }
    FIX_0_390180644 = INT32(Round(0.390180644 * CONST_SCALE));   { 3196 }
    FIX_0_541196100 = INT32(Round(0.541196100 * CONST_SCALE));   { 4433 }
    FIX_0_765366865 = INT32(Round(0.765366865 * CONST_SCALE));   { 6270 }
    FIX_0_899976223 = INT32(Round(0.899976223 * CONST_SCALE));   { 7373 }
    FIX_1_175875602 = INT32(Round(1.175875602 * CONST_SCALE));   { 9633 }
    FIX_1_501321110 = INT32(Round(1.501321110 * CONST_SCALE));   { 12299 }
    FIX_1_847759065 = INT32(Round(1.847759065 * CONST_SCALE));   { 15137 }
    FIX_1_961570560 = INT32(Round(1.961570560 * CONST_SCALE));   { 16069 }
    FIX_2_053119869 = INT32(Round(2.053119869 * CONST_SCALE));   { 16819 }
    FIX_2_562915447 = INT32(Round(2.562915447 * CONST_SCALE));   { 20995 }
    FIX_3_072711026 = INT32(Round(3.072711026 * CONST_SCALE));   { 25172 }

    { Rounding terms for DESCALE: half of the divisor implied by the right
      shift that follows.
        ROUND_CONST   goes with the shift by CONST_BITS-PASS1_BITS (pass 1);
        ROUND_CONST_2 goes with the shift by CONST_BITS+PASS1_BITS+3 (pass 2). }
    ROUND_CONST   = INT32(1) shl (CONST_BITS - PASS1_BITS - 1);
    ROUND_CONST_2 = INT32(1) shl (CONST_BITS + PASS1_BITS + 3 - 1);
  { Perform dequantization and inverse DCT on one block of coefficients.

    Calling convention (Delphi "register", as assumed by the unit header):
      EAX = cinfo       only sample_range_limit is read
      EDX = compptr     only dct_table is read
      ECX = coef_block  DCTSIZE x DCTSIZE coefficients, read column by column
      output_buf and output_col are referenced by name.
    Result:
      DCTSIZE samples are stored at output_buf^[row] + output_col for every
      row 0..DCTSIZE-1. There is no function result.
    Registers:
      EBX, ESI and EDI are pushed on entry and popped on exit;
      EAX, ECX and EDX are scratch.

    Pass 1 walks the columns of coef_block, multiplies each coefficient by
    its dct_table entry and writes a 1-D IDCT of the column into workspace,
    scaled up by 2**PASS1_BITS. Pass 2 walks the rows of workspace, applies
    the second 1-D IDCT, descales by PASS1_BITS+3 further bits and maps each
    value through the range_limit table.

    NOTE(review): every coefficient and multiplier is loaded as a DWORD and
    every output sample is stored as a single byte, so this routine
    presumably requires JCOEF and ISLOW_MULT_TYPE to be 32 bits wide and
    JSAMPLE to be 8 bits wide - confirm against the imjmorecfg / imjdct
    declarations before building with any other sample size. }
  {GLOBAL}
  procedure jpeg_idct_islow (cinfo : j_decompress_ptr;
                             compptr : jpeg_component_info_ptr;
                             coef_block : JCOEFPTR;
                             output_buf : JSAMPARRAY;
                             output_col : JDIMENSION);
  type
    TWorkspace = coef_bits_field;         { buffers data between passes }
  const
    coefDCTSIZE = DCTSIZE*SizeOf(JCOEF);  { byte stride between input rows }
    wrkDCTSIZE = DCTSIZE*SizeOf(int);     { byte stride between workspace rows;
                                            also used for the dct_table rows }
  var
    tmp0, tmp1, tmp2, tmp3 : INT32;
    tmp10, tmp11, tmp12, tmp13 : INT32;
    z1, z2, z3, z4 : INT32;
    range_limit : JSAMPROW;               { @sample_range_limit^[CENTERJSAMPLE] }
    ctr : int;                            { pass 1: columns left; pass 2: row index }
    workspace : TWorkspace;
  asm
    push  edi
    push  esi
    push  ebx
    cld                                   { the stosw stores in pass 2 must move forward }

    { Each IDCT routine is responsible for range-limiting its results and
      converting them to unsigned form (0..MAXJSAMPLE). The raw outputs can
      be far out of range if the input data is corrupt, so a mask-and-table
      lookup is used for both operations at once. See the comments with
      prepare_range_limit_table (in jdmaster.c) for more info.

      range_limit := JSAMPROW(@(cinfo^.sample_range_limit^[CENTERJSAMPLE])); }
    mov   eax, [eax].jpeg_decompress_struct.sample_range_limit  { eax = cinfo }
    add   eax, (MAXJSAMPLE+1 + CENTERJSAMPLE)*(Type JSAMPLE)
    mov   range_limit, eax

    { ---- Pass 1: process columns from input, store into work array. ----
      Results are scaled up by sqrt(8) compared to a true IDCT and further
      by 2**PASS1_BITS.
        esi = current input column   (inptr)
        edi = current multiplier column (quantptr)
        ecx = current workspace column (wsptr)
        ctr = number of columns still to do }
    mov   esi, ecx                                    { ecx = coef_block }
    mov   edi, [edx].jpeg_component_info.dct_table    { edx = compptr }
    lea   ecx, workspace
    mov   ctr, DCTSIZE

  @ColumnLoop:
    { Due to quantization, many input coefficients are usually zero,
      especially the AC terms. If all seven AC terms of this column are
      zero, every output equals the scaled DC term and the full
      calculation is skipped. }
    mov   eax, DWORD PTR [esi+coefDCTSIZE*1]
    or    eax, DWORD PTR [esi+coefDCTSIZE*2]
    or    eax, DWORD PTR [esi+coefDCTSIZE*3]
    mov   edx, DWORD PTR [esi+coefDCTSIZE*4]
    or    eax, edx
    or    eax, DWORD PTR [esi+coefDCTSIZE*5]
    or    eax, DWORD PTR [esi+coefDCTSIZE*6]
    or    eax, DWORD PTR [esi+coefDCTSIZE*7]
    jne   @ColumnFull

    { AC terms all zero:
      dcval := inptr^[0] * quantptr^[0] shl PASS1_BITS }
    mov   eax, DWORD PTR [esi+coefDCTSIZE*0]
    imul  eax, DWORD PTR [edi+wrkDCTSIZE*0]
    shl   eax, PASS1_BITS

    { wsptr^[DCTSIZE*k] := dcval for k = 0..7 }
    mov   DWORD PTR [ecx+wrkDCTSIZE*0], eax
    mov   DWORD PTR [ecx+wrkDCTSIZE*1], eax
    mov   DWORD PTR [ecx+wrkDCTSIZE*2], eax
    mov   DWORD PTR [ecx+wrkDCTSIZE*3], eax
    mov   DWORD PTR [ecx+wrkDCTSIZE*4], eax
    mov   DWORD PTR [ecx+wrkDCTSIZE*5], eax
    mov   DWORD PTR [ecx+wrkDCTSIZE*6], eax
    mov   DWORD PTR [ecx+wrkDCTSIZE*7], eax

    { advance all three pointers to the next column and continue }
    dec   ctr
    je    @RowPass
    add   esi, Type JCOEF
    add   edi, Type ISLOW_MULT_TYPE
    add   ecx, Type int
    jmp   @ColumnLoop

  @ColumnFull:
    { Even part: reverse the even part of the forward DCT.
      The rotator is sqrt(2)*c(-6).
        z2 := inptr^[DCTSIZE*2] * quantptr^[DCTSIZE*2];
        z3 := inptr^[DCTSIZE*6] * quantptr^[DCTSIZE*6];
        z1 := (z2 + z3) * FIX_0_541196100;
        tmp2 := z1 + z3 * (-FIX_1_847759065);
        tmp3 := z1 + z2 * FIX_0_765366865; }
    mov   edx, DWORD PTR [esi+coefDCTSIZE*2]
    imul  edx, DWORD PTR [edi+wrkDCTSIZE*2]           { edx = z2 }
    mov   eax, DWORD PTR [esi+coefDCTSIZE*6]
    imul  eax, DWORD PTR [edi+wrkDCTSIZE*6]           { eax = z3 }
    lea   ebx, [eax+edx]
    imul  ebx, FIX_0_541196100                        { ebx = z1 }
    imul  eax, (-FIX_1_847759065)
    add   eax, ebx
    mov   tmp2, eax
    imul  edx, FIX_0_765366865
    add   edx, ebx
    mov   tmp3, edx

    { z2 := inptr^[DCTSIZE*0] * quantptr^[DCTSIZE*0];
      z3 := inptr^[DCTSIZE*4] * quantptr^[DCTSIZE*4]; }
    mov   edx, DWORD PTR [esi+coefDCTSIZE*4]
    imul  edx, DWORD PTR [edi+wrkDCTSIZE*4]           { edx = z3 }
    mov   eax, DWORD PTR [esi+coefDCTSIZE*0]
    imul  eax, DWORD PTR [edi+wrkDCTSIZE*0]           { eax = z2 }

    { tmp0 := (z2 + z3) shl CONST_BITS;
      tmp1 := (z2 - z3) shl CONST_BITS; }
    lea   ebx, [eax+edx]
    sub   eax, edx
    shl   ebx, CONST_BITS                             { ebx = tmp0 }
    shl   eax, CONST_BITS                             { eax = tmp1 }

    { tmp13 := tmp0 - tmp3;  tmp10 := tmp0 + tmp3;
      the sum is formed as (tmp0 - tmp3) + 2*tmp3 }
    mov   edx, tmp3
    sub   ebx, edx
    mov   tmp13, ebx
    add   edx, edx
    add   ebx, edx
    mov   tmp10, ebx

    { tmp12 := tmp1 - tmp2;  tmp11 := tmp1 + tmp2;  same trick }
    mov   ebx, tmp2
    sub   eax, ebx
    mov   tmp12, eax
    add   ebx, ebx
    add   eax, ebx
    mov   tmp11, eax

    { Odd part per figure 8; the matrix is unitary and hence its
      transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.

      tmp0 := inptr^[DCTSIZE*7] * quantptr^[DCTSIZE*7]; }
    mov   eax, DWORD PTR [esi+coefDCTSIZE*7]
    imul  eax, DWORD PTR [edi+wrkDCTSIZE*7]
    mov   edx, eax                                    { edx = unscaled tmp0 }
    { tmp0 := tmp0 * FIX_0_298631336;   sqrt(2) * (-c1+c3+c5-c7) }
    imul  eax, FIX_0_298631336
    mov   tmp0, eax

    { tmp3 := inptr^[DCTSIZE*1] * quantptr^[DCTSIZE*1]; }
    mov   eax, DWORD PTR [esi+coefDCTSIZE*1]
    imul  eax, DWORD PTR [edi+wrkDCTSIZE*1]
    mov   tmp3, eax
    { z1 := (tmp0 + tmp3) * (-FIX_0_899976223);   sqrt(2) * (c7-c3) }
    add   eax, edx
    imul  eax, (-FIX_0_899976223)
    mov   z1, eax

    { tmp1 := inptr^[DCTSIZE*5] * quantptr^[DCTSIZE*5]; }
    mov   eax, DWORD PTR [esi+coefDCTSIZE*5]
    imul  eax, DWORD PTR [edi+wrkDCTSIZE*5]
    mov   ebx, eax                                    { ebx = unscaled tmp1 }
    { tmp1 := tmp1 * FIX_2_053119869;   sqrt(2) * ( c1+c3-c5+c7) }
    imul  eax, FIX_2_053119869
    mov   tmp1, eax

    { tmp2 := inptr^[DCTSIZE*3] * quantptr^[DCTSIZE*3]; }
    mov   eax, DWORD PTR [esi+coefDCTSIZE*3]
    imul  eax, DWORD PTR [edi+wrkDCTSIZE*3]
    mov   tmp2, eax
    { z3 := tmp0 + tmp2; }
    add   edx, eax                                    { edx = z3 }
    { z2 := (tmp1 + tmp2) * (-FIX_2_562915447);   sqrt(2) * (-c1-c3) }
    add   eax, ebx
    imul  eax, (-FIX_2_562915447)
    mov   z2, eax
    { z4 := tmp1 + tmp3; }
    add   ebx, tmp3                                   { ebx = z4 }
    { z5 := (z3 + z4) * FIX_1_175875602;   sqrt(2) * c3 }
    lea   eax, [edx+ebx]
    imul  eax, FIX_1_175875602                        { eax = z5 }
    { z4 := z4 * (-FIX_0_390180644) + z5;   sqrt(2) * (c5-c3) }
    imul  ebx, (-FIX_0_390180644)
    add   ebx, eax
    mov   z4, ebx
    { z3 := z3 * (-FIX_1_961570560) + z5;   sqrt(2) * (-c3-c5) }
    imul  edx, (-FIX_1_961570560)
    add   eax, edx                                    { eax = z3 }

    { Inc(tmp0, z1 + z3); }
    mov   ebx, z1
    add   ebx, eax
    add   tmp0, ebx

    { tmp2 := tmp2 * FIX_3_072711026 + z2 + z3;   sqrt(2) * ( c1+c3+c5-c7) }
    mov   ebx, tmp2
    imul  ebx, FIX_3_072711026
    mov   edx, z2                                     { edx = z2 }
    add   ebx, edx
    add   eax, ebx
    mov   tmp2, eax

    { Inc(tmp1, z2 + z4); }
    mov   eax, z4                                     { eax = z4 }
    add   edx, eax
    add   tmp1, edx

    { tmp3 := tmp3 * FIX_1_501321110 + z1 + z4;   sqrt(2) * ( c1+c3-c5-c7) }
    mov   edx, tmp3
    imul  edx, FIX_1_501321110
    add   edx, eax
    add   edx, z1                                     { edx = tmp3 }

    { Final output stage: inputs are tmp10..tmp13, tmp0..tmp3.
      Each pair is DESCALE(a + b) and DESCALE(a - b) with a shift of
      CONST_BITS-PASS1_BITS; ROUND_CONST is added once to a. }

    { wsptr^[DCTSIZE*0] := DESCALE(tmp10 + tmp3);
      wsptr^[DCTSIZE*7] := DESCALE(tmp10 - tmp3); }
    mov   eax, tmp10
    add   eax, ROUND_CONST
    lea   ebx, [eax+edx]
    sar   ebx, CONST_BITS-PASS1_BITS
    mov   DWORD PTR [ecx+wrkDCTSIZE*0], ebx
    sub   eax, edx
    sar   eax, CONST_BITS-PASS1_BITS
    mov   DWORD PTR [ecx+wrkDCTSIZE*7], eax

    { wsptr^[DCTSIZE*1] := DESCALE(tmp11 + tmp2);
      wsptr^[DCTSIZE*6] := DESCALE(tmp11 - tmp2); }
    mov   eax, tmp11
    add   eax, ROUND_CONST
    mov   edx, tmp2
    lea   ebx, [eax+edx]
    sar   ebx, CONST_BITS-PASS1_BITS
    mov   DWORD PTR [ecx+wrkDCTSIZE*1], ebx
    sub   eax, edx
    sar   eax, CONST_BITS-PASS1_BITS
    mov   DWORD PTR [ecx+wrkDCTSIZE*6], eax

    { wsptr^[DCTSIZE*2] := DESCALE(tmp12 + tmp1);
      wsptr^[DCTSIZE*5] := DESCALE(tmp12 - tmp1); }
    mov   eax, tmp12
    add   eax, ROUND_CONST
    mov   edx, tmp1
    lea   ebx, [eax+edx]
    sar   ebx, CONST_BITS-PASS1_BITS
    mov   DWORD PTR [ecx+wrkDCTSIZE*2], ebx
    sub   eax, edx
    sar   eax, CONST_BITS-PASS1_BITS
    mov   DWORD PTR [ecx+wrkDCTSIZE*5], eax

    { wsptr^[DCTSIZE*3] := DESCALE(tmp13 + tmp0);
      wsptr^[DCTSIZE*4] := DESCALE(tmp13 - tmp0); }
    mov   eax, tmp13
    add   eax, ROUND_CONST
    mov   edx, tmp0
    lea   ebx, [eax+edx]
    sar   ebx, CONST_BITS-PASS1_BITS
    mov   DWORD PTR [ecx+wrkDCTSIZE*3], ebx
    sub   eax, edx
    sar   eax, CONST_BITS-PASS1_BITS
    mov   DWORD PTR [ecx+wrkDCTSIZE*4], eax

    { advance all three pointers to the next column }
    dec   ctr
    je    @RowPass
    add   esi, Type JCOEF
    add   edi, Type ISLOW_MULT_TYPE
    add   ecx, Type int
    jmp   @ColumnLoop

  @RowPass:
    { ---- Pass 2: process rows from work array, store into output array. ----
      The results must be descaled by a factor of 8 == 2**3, and the
      PASS1_BITS scaling must be undone as well.
        esi = current workspace row (wsptr)
        edi = current output row    (outptr), reloaded for every row
        ctr = row index 0..DCTSIZE-1 }
    lea   esi, workspace
    mov   ctr, 0

  @RowLoop:
    { outptr := output_buf^[ctr] + output_col; }
    mov   eax, ctr
    mov   ebx, output_buf
    mov   edi, DWORD PTR [ebx+eax*4]                  { 4 = SizeOf(pointer) }
    add   edi, LongWord(output_col)

    { Rows of zeroes can be exploited in the same way as columns. However,
      the column calculation has created many nonzero AC terms, so the
      simplification applies less often (typically 5% to 10% of the time).
      On machines with very fast multiplication the test may cost more than
      it saves; define NO_ZERO_ROW_TEST to leave it out. }
  {$ifndef NO_ZERO_ROW_TEST}
    mov   eax, DWORD PTR [esi+4*1]
    or    eax, DWORD PTR [esi+4*2]
    or    eax, DWORD PTR [esi+4*3]
    jne   @RowFull                                    { Nomssi: early exit path may help }
    or    eax, DWORD PTR [esi+4*4]
    or    eax, DWORD PTR [esi+4*5]
    or    eax, DWORD PTR [esi+4*6]
    or    eax, DWORD PTR [esi+4*7]
    jne   @RowFull

    { AC terms all zero:
      dcval := range_limit^[DESCALE(wsptr^[0], PASS1_BITS+3) and RANGE_MASK] }
    mov   eax, DWORD PTR [esi+4*0]
    add   eax, (INT32(1) shl (PASS1_BITS+3-1))
    sar   eax, PASS1_BITS+3
    and   eax, RANGE_MASK
    mov   ebx, range_limit
    mov   al, BYTE PTR [ebx+eax]
    mov   ah, al                                      { ax = two copies of the sample }

    { outptr^[0..7] := dcval, written as four 16-bit stores through edi }
    stosw
    stosw
    stosw
    stosw

    { advance to the next workspace row and continue }
    add   esi, wrkDCTSIZE
    inc   ctr
    cmp   ctr, DCTSIZE
    jl    @RowLoop
    jmp   @Done

  @RowFull:
  {$endif}
    { Even part: reverse the even part of the forward DCT.
      The rotator is sqrt(2)*c(-6). }
    mov   edx, DWORD PTR [esi+4*2]                    { edx = z2 = wsptr^[2] }
    mov   ecx, DWORD PTR [esi+4*6]                    { ecx = z3 = wsptr^[6] }
    { z1 := (z2 + z3) * FIX_0_541196100; }
    lea   eax, [edx+ecx]
    imul  eax, FIX_0_541196100
    mov   ebx, eax                                    { ebx = z1 }
    { tmp2 := z1 + z3 * (-FIX_1_847759065); }
    imul  ecx, (-FIX_1_847759065)
    add   ecx, ebx                                    { ecx = tmp2 }
    { tmp3 := z1 + z2 * FIX_0_765366865; }
    imul  edx, FIX_0_765366865
    add   ebx, edx                                    { ebx = tmp3 }

    { tmp0 := (wsptr^[0] + wsptr^[4]) shl CONST_BITS;
      tmp1 := (wsptr^[0] - wsptr^[4]) shl CONST_BITS;
      the sum is formed as (w0 - w4) + 2*w4 }
    mov   edx, DWORD PTR [esi+4*4]
    mov   eax, DWORD PTR [esi+4*0]
    sub   eax, edx
    add   edx, edx
    add   edx, eax
    shl   edx, CONST_BITS                             { edx = tmp0 }
    shl   eax, CONST_BITS                             { eax = tmp1 }

    { tmp13 := tmp0 - tmp3;  tmp10 := tmp0 + tmp3; }
    sub   edx, ebx
    mov   tmp13, edx
    add   ebx, ebx
    add   edx, ebx
    mov   tmp10, edx

    { tmp11 := tmp1 + tmp2;  tmp12 := tmp1 - tmp2; }
    lea   ebx, [ecx+eax]
    mov   tmp11, ebx
    sub   eax, ecx
    mov   tmp12, eax

    { Odd part per figure 8; the matrix is unitary and hence its
      transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
      The four inputs tmp0..tmp3 = wsptr^[7], [5], [3], [1] are read
      straight from the workspace row through esi whenever needed. }

    { z2 := (tmp1 + tmp2) * (-FIX_2_562915447);   sqrt(2) * (-c1-c3) }
    mov   ebx, DWORD PTR [esi+4*3]                    { ebx = tmp2 }
    mov   ecx, DWORD PTR [esi+4*5]                    { ecx = tmp1 }
    lea   eax, [ebx+ecx]
    imul  eax, (-FIX_2_562915447)
    mov   z2, eax

    { z3 := tmp0 + tmp2; }
    mov   edx, DWORD PTR [esi+4*7]                    { edx = tmp0 }
    add   ebx, edx                                    { ebx = unscaled z3 }
    mov   eax, ebx
    { z3 := z3 * (-FIX_1_961570560);   sqrt(2) * (-c3-c5) }
    imul  eax, (-FIX_1_961570560)
    mov   z3, eax

    { z1 := (tmp0 + tmp3) * (-FIX_0_899976223);   sqrt(2) * (c7-c3) }
    mov   eax, DWORD PTR [esi+4*1]                    { eax = tmp3 }
    add   edx, eax
    imul  edx, (-FIX_0_899976223)                     { edx = z1 }

    { z4 := tmp1 + tmp3; }
    add   eax, ecx                                    { eax = unscaled z4 }
    add   ebx, eax                                    { ebx = z3 + z4, both unscaled }
    { z4 := z4 * (-FIX_0_390180644);   sqrt(2) * (c5-c3) }
    imul  eax, (-FIX_0_390180644)                     { eax = z4 }
    { z5 := (z3 + z4) * FIX_1_175875602;   sqrt(2) * c3
      Inc(z3, z5); }
    imul  ebx, FIX_1_175875602                        { ebx = z5 }
    mov   ecx, z3
    add   ecx, ebx                                    { ecx = z3 }
    { Inc(z4, z5); }
    add   ebx, eax                                    { ebx = z4 }

    { tmp0 := tmp0 * FIX_0_298631336 + z1 + z3;   sqrt(2) * (-c1+c3+c5-c7) }
    mov   eax, DWORD PTR [esi+4*7]
    imul  eax, FIX_0_298631336
    add   eax, edx
    add   eax, ecx
    mov   tmp0, eax

    { tmp1 := tmp1 * FIX_2_053119869 + z2 + z4;   sqrt(2) * ( c1+c3-c5+c7) }
    mov   eax, DWORD PTR [esi+4*5]
    imul  eax, FIX_2_053119869
    add   eax, z2
    add   eax, ebx
    mov   tmp1, eax

    { tmp2 := tmp2 * FIX_3_072711026 + z2 + z3;   sqrt(2) * ( c1+c3+c5-c7) }
    mov   eax, DWORD PTR [esi+4*3]
    imul  eax, FIX_3_072711026
    add   eax, z2
    add   ecx, eax                                    { ecx = tmp2 }

    { tmp3 := tmp3 * FIX_1_501321110 + z1 + z4;   sqrt(2) * ( c1+c3-c5-c7) }
    mov   eax, DWORD PTR [esi+4*1]
    imul  eax, FIX_1_501321110
    add   eax, edx
    add   ebx, eax                                    { ebx = tmp3 }

    { Final output stage: inputs are tmp10..tmp13, tmp0..tmp3.
      Every output is
        range_limit^[DESCALE(a +/- b, CONST_BITS+PASS1_BITS+3) and RANGE_MASK].
      A logical shift is used for the descale; the bits that differ from an
      arithmetic shift are discarded by the RANGE_MASK that follows. }

    { outptr^[0] from tmp10 + tmp3,  outptr^[7] from tmp10 - tmp3 }
    mov   edx, tmp10
    add   edx, ROUND_CONST_2
    lea   eax, [ebx+edx]
    sub   edx, ebx
    shr   eax, CONST_BITS+PASS1_BITS+3
    and   eax, RANGE_MASK
    mov   ebx, range_limit                            { ebx = table base for the rest of the row }
    mov   al, BYTE PTR [ebx+eax]
    mov   [edi+0], al
    shr   edx, CONST_BITS+PASS1_BITS+3
    and   edx, RANGE_MASK
    mov   al, BYTE PTR [ebx+edx]
    mov   [edi+7], al

    { outptr^[1] from tmp11 + tmp2 }
    mov   eax, tmp11
    add   eax, ROUND_CONST_2
    lea   edx, [eax+ecx]
    shr   edx, CONST_BITS+PASS1_BITS+3
    and   edx, RANGE_MASK
    mov   dl, BYTE PTR [ebx+edx]
    mov   [edi+1], dl
    { outptr^[6] from tmp11 - tmp2 }
    sub   eax, ecx
    shr   eax, CONST_BITS+PASS1_BITS+3
    and   eax, RANGE_MASK
    mov   al, BYTE PTR [ebx+eax]
    mov   [edi+6], al

    { outptr^[2] from tmp12 + tmp1 }
    mov   eax, tmp12
    add   eax, ROUND_CONST_2
    mov   ecx, tmp1
    lea   edx, [eax+ecx]
    shr   edx, CONST_BITS+PASS1_BITS+3
    and   edx, RANGE_MASK
    mov   dl, BYTE PTR [ebx+edx]
    mov   [edi+2], dl
    { outptr^[5] from tmp12 - tmp1 }
    sub   eax, ecx
    shr   eax, CONST_BITS+PASS1_BITS+3
    and   eax, RANGE_MASK
    mov   al, BYTE PTR [ebx+eax]
    mov   [edi+5], al

    { outptr^[3] from tmp13 + tmp0 }
    mov   eax, tmp13
    add   eax, ROUND_CONST_2
    mov   ecx, tmp0
    lea   edx, [eax+ecx]
    shr   edx, CONST_BITS+PASS1_BITS+3
    and   edx, RANGE_MASK
    mov   dl, BYTE PTR [ebx+edx]
    mov   [edi+3], dl
    { outptr^[4] from tmp13 - tmp0 }
    sub   eax, ecx
    shr   eax, CONST_BITS+PASS1_BITS+3
    and   eax, RANGE_MASK
    mov   al, BYTE PTR [ebx+eax]
    mov   [edi+4], al

    { Advance to the next workspace row. edi is not advanced here because
      it is reloaded from output_buf at the top of the loop. }
    add   esi, wrkDCTSIZE
    inc   ctr
    cmp   ctr, DCTSIZE
    jl    @RowLoop

  @Done:
    pop   ebx
    pop   esi
    pop   edi
  end;
  663. end.