Main Page | Class List | Directories | File List | Class Members | File Members

match.c

Go to the documentation of this file.
00001 /* match.s -- optional optimized asm version of longest match in deflate.c
00002 
00003    Copyright (C) 2002, 2006 Free Software Foundation, Inc.
00004    Copyright (C) 1992-1993 Jean-loup Gailly
00005 
00006    This program is free software; you can redistribute it and/or modify
00007    it under the terms of the GNU General Public License as published by
00008    the Free Software Foundation; either version 2, or (at your option)
00009    any later version.
00010 
00011    This program is distributed in the hope that it will be useful,
00012    but WITHOUT ANY WARRANTY; without even the implied warranty of
00013    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014    GNU General Public License for more details.
00015 
00016    You should have received a copy of the GNU General Public License
00017    along with this program; if not, write to the Free Software Foundation,
00018    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
00019 
00020 /*
00021  * The 68020 version has been written by Francesco Potorti` <pot@cnuce.cnr.it>
00022  * with adaptations by Carsten Steger <stegerc@informatik.tu-muenchen.de>,
00023  * Andreas Schwab <schwab@lamothe.informatik.uni-dortmund.de> and
00024  * Kristoffer Eriksson <ske@pkmab.se>
00025  *
00026  * The ia64 version has been written by Sverre Jarp (HP Labs) 2001-2002.
00027  * Unwind directives and some reformatting for better readability added by
00028  * David Mosberger-Tang <davidm@hpl.hp.com>.
00029  */
00030 
00031 /* $Id: match.c,v 1.1 2006/11/20 08:40:34 eggert Exp $ */
00032 
00033 /* Preprocess with -DNO_UNDERLINE if your C compiler does not prefix
00034  * external symbols with an underline character '_'.
00035  */
00036 #ifdef NO_UNDERLINE
00037 #  define _prev             prev
00038 #  define _window           window
00039 #  define _match_start      match_start
00040 #  define _prev_length      prev_length
00041 #  define _good_match       good_match
00042 #  define _nice_match       nice_match
00043 #  define _strstart         strstart
00044 #  define _max_chain_length max_chain_length
00045 
00046 #  define _match_init       match_init
00047 #  define _longest_match    longest_match
00048 #endif
00049 
00050 #ifdef DYN_ALLOC
00051   error: DYN_ALLOC not yet supported in match.s
00052 #endif
00053 
00054 #if defined(i386) || defined(_I386) || defined(__i386) || defined(__i386__)
00055 
00056 /* This version is for 386 Unix or OS/2 in 32 bit mode.
00057  * Warning: it uses the AT&T syntax: mov source,dest
00058  * This file is only optional. If you want to force the C version,
00059  * add -DNO_ASM to CFLAGS in Makefile and set OBJA to an empty string.
00060  * If you have reduced WSIZE in gzip.h, then change its value below.
00061  * This version assumes static allocation of the arrays (-DDYN_ALLOC not used).
00062  */
00063 
00064         .file   "match.S"
00065 
00066 #define MAX_MATCH       258
00067 #define MAX_MATCH2      $128 /* MAX_MATCH/2-1 */
00068 #define MIN_MATCH       3
00069 #define    WSIZE        $32768
00070 #define MAX_DIST        WSIZE - MAX_MATCH - MIN_MATCH - 1
00071 
00072         .globl  _match_init
00073         .globl  _longest_match
00074 
00075         .text
00076 
00077 _match_init:
00078         ret
00079 
00080 /*-----------------------------------------------------------------------
00081  * Set match_start to the longest match starting at the given string and
00082  * return its length. Matches shorter or equal to prev_length are discarded,
00083  * in which case the result is equal to prev_length and match_start is
00084  * garbage.
00085  * IN assertions: cur_match is the head of the hash chain for the current
00086  *   string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1
00087  */
00088 
00089 _longest_match: /* int longest_match(cur_match) */
00090 
00091 #define cur_match   20(%esp)
00092      /* return address */               /* esp+16 */
00093         push    %ebp                    /* esp+12 */
00094         push    %edi                    /* esp+8  */
00095         push    %esi                    /* esp+4  */
00096         push    %ebx                    /* esp    */
00097 
00098 /*
00099  *      match        equ esi
00100  *      scan         equ edi
00101  *      chain_length equ ebp
00102  *      best_len     equ ebx
00103  *      limit        equ edx
00104  */
00105         mov     cur_match,%esi
00106         mov     _max_chain_length,%ebp /* chain_length = max_chain_length */
00107         mov     _strstart,%edi
00108         mov     %edi,%edx
00109         sub     MAX_DIST,%edx          /* limit = strstart-MAX_DIST */
00110         jae     limit_ok
00111         sub     %edx,%edx              /* limit = NIL */
00112 limit_ok:
00113         add     $2+_window,%edi        /* edi = offset(window+strstart+2) */
00114         mov     _prev_length,%ebx      /* best_len = prev_length */
00115         movw    -3(%ebx,%edi),%ax      /* ax = scan[best_len-1..best_len] */
00116         movw    -2(%edi),%cx           /* cx = scan[0..1] */
00117         cmp     _good_match,%ebx       /* do we have a good match already? */
00118         jb      do_scan
00119         shr     $2,%ebp                /* chain_length >>= 2 */
00120         jmp     do_scan
00121 
00122         .align  4
00123 long_loop:
00124 /* at this point, edi == scan+2, esi == cur_match */
00125         movw    -3(%ebx,%edi),%ax       /* ax = scan[best_len-1..best_len] */
00126         movw     -2(%edi),%cx           /* cx = scan[0..1] */
00127 short_loop:
00128 /*
00129  * at this point, di == scan+2, si == cur_match,
00130  * ax = scan[best_len-1..best_len] and cx = scan[0..1]
00131  */
00132         and     WSIZE-1, %esi
00133         movw    _prev(%esi,%esi),%si    /* cur_match = prev[cur_match] */
00134                                         /* top word of esi is still 0 */
00135         cmp     %edx,%esi               /* cur_match <= limit ? */
00136         jbe     the_end
00137         dec     %ebp                    /* --chain_length */
00138         jz      the_end
00139 do_scan:
00140         cmpw    _window-1(%ebx,%esi),%ax/* check match at best_len-1 */
00141         jne     short_loop
00142         cmpw    _window(%esi),%cx       /* check min_match_length match */
00143         jne     short_loop
00144 
00145         lea     _window+2(%esi),%esi    /* si = match */
00146         mov     %edi,%eax               /* ax = scan+2 */
00147         mov     MAX_MATCH2,%ecx         /* scan for at most MAX_MATCH bytes */
00148         rep;    cmpsw                   /* loop until mismatch */
00149         je      maxmatch                /* match of length MAX_MATCH? */
00150 mismatch:
00151         movb    -2(%edi),%cl        /* mismatch on first or second byte? */
00152         subb    -2(%esi),%cl        /* cl = 0 if first bytes equal */
00153         xchg    %edi,%eax           /* edi = scan+2, eax = end of scan */
00154         sub     %edi,%eax           /* eax = len */
00155         sub     %eax,%esi           /* esi = cur_match + 2 + offset(window) */
00156         sub     $2+_window,%esi     /* esi = cur_match */
00157         subb    $1,%cl              /* set carry if cl == 0 (cannot use DEC) */
00158         adc     $0,%eax             /* eax = carry ? len+1 : len */
00159         cmp     %ebx,%eax           /* len > best_len ? */
00160         jle     long_loop
00161         mov     %esi,_match_start       /* match_start = cur_match */
00162         mov     %eax,%ebx               /* ebx = best_len = len */
00163         cmp     _nice_match,%eax        /* len >= nice_match ? */
00164         jl      long_loop
00165 the_end:
00166         mov     %ebx,%eax               /* result = eax = best_len */
00167         pop     %ebx
00168         pop     %esi
00169         pop     %edi
00170         pop     %ebp
00171         ret
00172 maxmatch:
00173         cmpsb
00174         jmp     mismatch
00175 
00176 #else
00177 
00178 /* ======================== 680x0 version ================================= */
00179 
00180 #if defined(m68k)||defined(mc68k)||defined(__mc68000__)||defined(__MC68000__)
00181 #  ifndef mc68000
00182 #    define mc68000
00183 #  endif
00184 #endif
00185 
00186 #if defined(__mc68020__) || defined(__MC68020__) || defined(sysV68)
00187 #  ifndef mc68020
00188 #    define mc68020
00189 #  endif
00190 #endif
00191 
00192 #if defined(mc68020) || defined(mc68000)
00193 
00194 #if (defined(mc68020) || defined(NeXT)) && !defined(UNALIGNED_OK)
00195 #  define UNALIGNED_OK
00196 #endif
00197 
00198 #ifdef sysV68  /* Try Motorola Delta style */
00199 
00200 #  define GLOBAL(symbol)        global  symbol
00201 #  define TEXT                  text
00202 #  define FILE(filename)        file    filename
00203 #  define invert_maybe(src,dst) dst,src
00204 #  define imm(data)             &data
00205 #  define reg(register)         %register
00206 
00207 #  define addl                  add.l
00208 #  define addql                 addq.l
00209 #  define blos                  blo.b
00210 #  define bhis                  bhi.b
00211 #  define bras                  bra.b
00212 #  define clrl                  clr.l
00213 #  define cmpmb                 cmpm.b
00214 #  define cmpw                  cmp.w
00215 #  define cmpl                  cmp.l
00216 #  define lslw                  lsl.w
00217 #  define lsrl                  lsr.l
00218 #  define movel                 move.l
00219 #  define movew                 move.w
00220 #  define moveb                 move.b
00221 #  define moveml                movem.l
00222 #  define subl                  sub.l
00223 #  define subw                  sub.w
00224 #  define subql                 subq.l
00225 
00226 #  define IndBase(bd,An)        (bd,An)
00227 #  define IndBaseNdxl(bd,An,Xn) (bd,An,Xn.l)
00228 #  define IndBaseNdxw(bd,An,Xn) (bd,An,Xn.w)
00229 #  define predec(An)            -(An)
00230 #  define postinc(An)           (An)+
00231 
00232 #else /* default style (Sun 3, NeXT, Amiga, Atari) */
00233 
00234 #  define GLOBAL(symbol)        .globl  symbol
00235 #  define TEXT                  .text
00236 #  define FILE(filename)        .even
00237 #  define invert_maybe(src,dst) src,dst
00238 #  if defined(sun) || defined(mc68k)
00239 #    define imm(data)           #data
00240 #  else
00241 #    define imm(data)           \#data
00242 #  endif
00243 #  define reg(register)         register
00244 
00245 #  define blos                  bcss
00246 #  if defined(sun) || defined(mc68k)
00247 #    define movel               movl
00248 #    define movew               movw
00249 #    define moveb               movb
00250 #  endif
00251 #  define IndBase(bd,An)        An@(bd)
00252 #  define IndBaseNdxl(bd,An,Xn) An@(bd,Xn:l)
00253 #  define IndBaseNdxw(bd,An,Xn) An@(bd,Xn:w)
00254 #  define predec(An)            An@-
00255 #  define postinc(An)           An@+
00256 
00257 #endif  /* styles */
00258 
00259 #define Best_Len        reg(d0)         /* unsigned */
00260 #define Cur_Match       reg(d1)         /* Ipos */
00261 #define Loop_Counter    reg(d2)         /* int */
00262 #define Scan_Start      reg(d3)         /* unsigned short */
00263 #define Scan_End        reg(d4)         /* unsigned short */
00264 #define Limit           reg(d5)         /* IPos */
00265 #define Chain_Length    reg(d6)         /* unsigned */
00266 #define Scan_Test       reg(d7)
00267 #define Scan            reg(a0)         /* *uch */
00268 #define Match           reg(a1)         /* *uch */
00269 #define Prev_Address    reg(a2)         /* *Pos */
00270 #define Scan_Ini        reg(a3)         /* *uch */
00271 #define Match_Ini       reg(a4)         /* *uch */
00272 #define Stack_Pointer   reg(sp)
00273 
00274 #define MAX_MATCH       258
00275 #define MIN_MATCH       3
00276 #define WSIZE           32768
00277 #define MAX_DIST        (WSIZE - MAX_MATCH - MIN_MATCH - 1)
00278 
00279         GLOBAL  (_match_init)
00280         GLOBAL  (_longest_match)
00281 
00282         TEXT
00283 
00284         FILE    ("match.S")
00285 
00286 _match_init:
00287         rts
00288 
00289 /*-----------------------------------------------------------------------
00290  * Set match_start to the longest match starting at the given string and
00291  * return its length. Matches shorter or equal to prev_length are discarded,
00292  * in which case the result is equal to prev_length and match_start is
00293  * garbage.
00294  * IN assertions: cur_match is the head of the hash chain for the current
00295  *   string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1
00296  */
00297 
00298 /* int longest_match (cur_match) */
00299 
00300 #ifdef UNALIGNED_OK
00301 #  define pushreg       15928           /* d2-d6/a2-a4 */
00302 #  define popreg        7292
00303 #else
00304 #  define pushreg       16184           /* d2-d7/a2-a4 */
00305 #  define popreg        7420
00306 #endif
00307 
00308 _longest_match:
00309         movel   IndBase(4,Stack_Pointer),Cur_Match
00310         moveml  imm(pushreg),predec(Stack_Pointer)
00311         movel   _max_chain_length,Chain_Length
00312         movel   _prev_length,Best_Len
00313         movel   imm(_prev),Prev_Address
00314         movel   imm(_window+MIN_MATCH),Match_Ini
00315         movel   _strstart,Limit
00316         movel   Match_Ini,Scan_Ini
00317         addl    Limit,Scan_Ini
00318         subw    imm(MAX_DIST),Limit
00319         bhis    L__limit_ok
00320         clrl    Limit
00321 L__limit_ok:
00322         cmpl    invert_maybe(_good_match,Best_Len)
00323         blos    L__length_ok
00324         lsrl    imm(2),Chain_Length
00325 L__length_ok:
00326         subql   imm(1),Chain_Length
00327 #ifdef UNALIGNED_OK
00328         movew   IndBase(-MIN_MATCH,Scan_Ini),Scan_Start
00329         movew   IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End
00330 #else
00331         moveb   IndBase(-MIN_MATCH,Scan_Ini),Scan_Start
00332         lslw    imm(8),Scan_Start
00333         moveb   IndBase(-MIN_MATCH+1,Scan_Ini),Scan_Start
00334         moveb   IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End
00335         lslw    imm(8),Scan_End
00336         moveb   IndBaseNdxw(-MIN_MATCH,Scan_Ini,Best_Len),Scan_End
00337 #endif
00338         bras    L__do_scan
00339 
00340 L__long_loop:
00341 #ifdef UNALIGNED_OK
00342         movew   IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End
00343 #else
00344         moveb   IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End
00345         lslw    imm(8),Scan_End
00346         moveb   IndBaseNdxw(-MIN_MATCH,Scan_Ini,Best_Len),Scan_End
00347 #endif
00348 
00349 L__short_loop:
00350         lslw    imm(1),Cur_Match
00351         movew   IndBaseNdxl(0,Prev_Address,Cur_Match),Cur_Match
00352         cmpw    invert_maybe(Limit,Cur_Match)
00353         dbls    Chain_Length,L__do_scan
00354         bras    L__return
00355 
00356 L__do_scan:
00357         movel   Match_Ini,Match
00358         addl    Cur_Match,Match
00359 #ifdef UNALIGNED_OK
00360         cmpw    invert_maybe(IndBaseNdxw(-MIN_MATCH-1,Match,Best_Len),Scan_End)
00361         bne     L__short_loop
00362         cmpw    invert_maybe(IndBase(-MIN_MATCH,Match),Scan_Start)
00363         bne     L__short_loop
00364 #else
00365         moveb   IndBaseNdxw(-MIN_MATCH-1,Match,Best_Len),Scan_Test
00366         lslw    imm(8),Scan_Test
00367         moveb   IndBaseNdxw(-MIN_MATCH,Match,Best_Len),Scan_Test
00368         cmpw    invert_maybe(Scan_Test,Scan_End)
00369         bne     L__short_loop
00370         moveb   IndBase(-MIN_MATCH,Match),Scan_Test
00371         lslw    imm(8),Scan_Test
00372         moveb   IndBase(-MIN_MATCH+1,Match),Scan_Test
00373         cmpw    invert_maybe(Scan_Test,Scan_Start)
00374         bne     L__short_loop
00375 #endif
00376 
00377         movew   imm((MAX_MATCH-MIN_MATCH+1)-1),Loop_Counter
00378         movel   Scan_Ini,Scan
00379 L__scan_loop:
00380         cmpmb   postinc(Match),postinc(Scan)
00381         dbne    Loop_Counter,L__scan_loop
00382 
00383         subl    Scan_Ini,Scan
00384         addql   imm(MIN_MATCH-1),Scan
00385         cmpl    invert_maybe(Best_Len,Scan)
00386         bls     L__short_loop
00387         movel   Scan,Best_Len
00388         movel   Cur_Match,_match_start
00389         cmpl    invert_maybe(_nice_match,Best_Len)
00390         blos    L__long_loop
00391 L__return:
00392         moveml  postinc(Stack_Pointer),imm(popreg)
00393         rts
00394 
00395 #else
00396 
00397 # if defined (__ia64__)
00398 
00399 /* ======================== ia64 version ================================= */
00400 
00401 /*
00402  * 'longest_match.S' (assembly program for gzip for the IA-64 architecture)
00403  *
00404  * Optimised for McKinley, but with Merced-compatibility, such as MIB+MIB, used wherever
00405  * possible.
00406  *
00407  * Copyright: Sverre Jarp (HP Labs) 2001-2002
00408  *
00409  * See deflate.c for c-version
00410  * Version 2 - Optimize the outer loop
00411  */
00412 
00413 #include <endian.h>
00414 
00415 #if __BYTE_ORDER == ____BIG_ENDIAN
00416 #define  first  shl
00417 #define  second shr.u
00418 #define  count  czx1.l
00419 #else
00420 #define  first  shr.u
00421 #define  second shl
00422 #define  count  czx1.r
00423 #endif
00424 
00425 // 24 rotating register (r32 - r55)
00426 
00427 #define s_vmatch0               r32
00428 #define s_vmatch1               r33
00429 #define s_vmatbst               r34
00430 #define s_vmatbst1              r35
00431 #define s_amatblen              r36
00432 
00433 #define s_tm1                   r56
00434 #define s_tm2                   r57
00435 #define s_tm3                   r58
00436 #define s_tm4                   r59
00437 #define s_tm5                   r60
00438 #define s_tm6                   r61
00439 #define s_tm7                   r62
00440 #define s_tm8                   r63
00441 
00442 #define s_vlen                  r31
00443 #define s_vstrstart             r30
00444 #define s_vchainlen             r29
00445 #define s_awinbest              r28
00446 #define s_vcurmatch             r27
00447 #define s_vlimit                r26
00448 #define s_vscanend              r25
00449 #define s_vscanend1             r24
00450 #define s_anicematch            r23
00451 #define s_vscan0                r22
00452 #define s_vscan1                r21
00453 
00454 #define s_aprev                 r20
00455 #define s_awindow               r19
00456 #define s_amatchstart           r18
00457 #define s_ascan                 r17
00458 #define s_amatch                r16
00459 #define s_wmask                 r15
00460 #define s_ascanend              r14
00461 
00462 #define s_vspec_cmatch          r11             // next iteration
00463 #define s_lcsave                r10
00464 #define s_prsave                r9
00465 #define s_vbestlen              r8              // return register
00466 
00467 #define s_vscan3                r3
00468 #define s_vmatch3               r2
00469 
00470 #define p_no                    p2
00471 #define p_yes                   p3
00472 #define p_shf                   p4              //
00473 #define p_bn2                   p5              // Use in loop (indicating bestlen != 2)
00474 
00475 #define p_nbs                   p9              // not new best_len
00476 #define p_nnc                   p10             // not nice_length
00477 #define p_ll                    p11
00478 #define p_end                   p12
00479 
00480 #define MAX_MATCH               258
00481 #define MIN_MATCH                 4
00482 #define WSIZE                 32768
00483 #define MAX_DIST                WSIZE - MAX_MATCH - MIN_MATCH - 1
00484 
00485 #define R_INPUT                 1
00486 #define R_LOCAL                 31
00487 #define R_OUTPUT                0
00488 #define R_ROTATING              24
00489 #define MLAT                    3
00490 #define SHLAT                   2
00491 
00492 #define mova                    mov
00493 #define movi0                   mov
00494 #define cgtu                    cmp.gt.unc
00495 #define cgeu                    cmp.ge.unc
00496 #define cneu                    cmp.ne.unc
00497 
00498         .global longest_match
00499         .proc longest_match
00500         .align 32
00501 longest_match:
00502 // --  Cycle: 0
00503         .prologue
00504 {.mmi
00505          alloc r2=ar.pfs,R_INPUT,R_LOCAL,R_OUTPUT,R_ROTATING
00506         .rotr scan[MLAT+2], match[MLAT+2], shscan0[SHLAT+1], \
00507               shscan1[SHLAT+1], shmatch0[SHLAT+1], shmatch1[SHLAT+1]
00508         .rotp lc[MLAT+SHLAT+2]
00509         mova s_vspec_cmatch=in0 // cur_match from input register
00510         add s_tm1=@gprel(strstart),gp // a(a(strstart))
00511 }{.mmi
00512         add s_tm3=@gprel(prev_length),gp // a(a(prev_length))
00513         add s_tm5=@ltoff(window),gp // a(a(window))
00514         add s_tm6=@ltoff(prev),gp // a(a(prev))
00515         ;;
00516 }{.mmb  //  Cycle: 1
00517         ld4 s_vstrstart=[s_tm1] // strstart
00518         ld4 s_vbestlen=[s_tm3] // best_len = prev_length
00519         brp.loop.imp .cmploop,.cmploop+48
00520 }{.mli
00521         add s_tm2=@gprel(max_chain_length),gp // a(a(max_chain_length))
00522         movl s_wmask=WSIZE-1
00523         ;;
00524 }{.mmi  //  Cycle: 2
00525         ld8 s_aprev=[s_tm6] // a(prev)
00526         ld8 s_awindow=[s_tm5] // a(window)
00527         .save pr, s_prsave
00528         movi0 s_prsave=pr // save predicates
00529 }{.mmi
00530         add s_tm4=@gprel(good_match),gp // a(a(good_match))
00531         add s_tm7=@ltoff(nice_match),gp // a(a(nice_match))
00532         add s_tm8=@ltoff(match_start),gp // a(match_start)
00533         ;;
00534 }{.mmi  //  Cycle: 3
00535         ld8 s_anicematch=[s_tm7] // a(nice_match)
00536         ld8 s_amatchstart=[s_tm8] // a(match_start)
00537         .save ar.lc, s_lcsave
00538         movi0 s_lcsave=ar.lc // save loop count register
00539 }{.mmi
00540         .body
00541         add s_tm1=-(MAX_MATCH + MIN_MATCH),s_wmask // maxdist
00542         cmp.eq p_ll,p0=r0,r0 // parallel compare initialized as 'true'
00543         mova s_vcurmatch=s_vspec_cmatch
00544         ;;
00545 }{.mmi  //  Cycle: 4
00546         ld4 s_vchainlen=[s_tm2] // chain_length=max_chain_length
00547         ld4 s_tm4=[s_tm4] // v(good_match)
00548         add s_ascan=s_awindow,s_vstrstart // scan=window + strstart
00549 }{.mmi
00550         sub s_vlimit=s_vstrstart, s_tm1 // limit=strstart - MAX_DIST
00551         add s_amatch=s_awindow,s_vspec_cmatch // match=window + cur_match
00552         and s_vspec_cmatch =s_vspec_cmatch,s_wmask
00553         ;;
00554 }{.mmi  //  Cycle: 5
00555         add s_amatblen=s_amatch,s_vbestlen //
00556         cneu p_bn2,p0=2,s_vbestlen // set if bestlen != 2
00557         add s_ascanend=s_ascan,s_vbestlen // compute a(scan) + best_len
00558 }{.mmi
00559         ld1 s_vscan0=[s_ascan],1 // NB: s_ascan++
00560         ld1 s_vmatch0=[s_amatch],1
00561         cgtu p0,p_no=s_vlimit,r0 // is result positive ?
00562         ;;
00563 }{.mmi  //  Cycle: 6
00564         ld1.nt1 s_vscan1=[s_ascan],2 // NB: s_ascan+3 in total
00565         ld1.nt1 s_vmatch1=[s_amatch],2
00566         add s_awinbest=s_awindow,s_vbestlen //
00567         ;;
00568 }{.mmi  //  Cycle: 7
00569         ld1.nt1 s_vscanend=[s_ascanend],-1 // scan_end=scan[best_len]
00570         ld1.nt1 s_vmatbst=[s_amatblen],-1
00571 (p_no)  mova s_vlimit=r0
00572         ;;
00573 }{.mmi  //  Cycle: 8
00574 (p_bn2) ld1.nt1 s_vscanend1=[s_ascanend],1 // scan_end1=scan[best_len-1]
00575 (p_bn2) ld1.nt1 s_vmatbst1=[s_amatblen]
00576         shladd s_vspec_cmatch =s_vspec_cmatch,1,s_aprev
00577 }{.mmi
00578         cgeu p_shf,p0=s_vbestlen,s_tm4 // is (prev_length >= good_match) ?
00579         ;;
00580 }{.mmi  //  Cycle: 9
00581         ld1.nt1 s_vscan3=[s_ascan]
00582         ld2.nt1 s_vspec_cmatch=[s_vspec_cmatch]
00583         mova    s_vlen=3
00584 }{.mmi
00585 (p_shf) shr.u s_vchainlen=s_vchainlen,2 // (cur_len) >> 2
00586         ;;
00587 }{.mmi  //  Cycle: 10
00588         ld1.nt1 s_vmatch3=[s_amatch]
00589         // p_ll switched on as soon as we get a mismatch:
00590         cmp.eq.and p_ll,p0=s_vmatch0,s_vscan0
00591         cmp.eq.and p_ll,p0=s_vmatbst,s_vscanend
00592 }{.mib
00593         cmp.eq.and p_ll,p0=s_vmatch1,s_vscan1
00594 (p_bn2) cmp.eq.and p_ll,p0=s_vmatbst1,s_vscanend1
00595 (p_ll)  br.cond.dpnt.many .test_more
00596         ;;
00597 }
00598 
00599 .next_iter:
00600 {.mmi   // Cycle 0
00601         add s_amatch=s_awindow,s_vspec_cmatch   // match=window + cur_match
00602         mov s_vcurmatch=s_vspec_cmatch          // current value
00603         add s_vchainlen=-1,s_vchainlen          // --chain_length
00604 }{.mib
00605         cmp.le.unc p_end,p0=s_vspec_cmatch,s_vlimit
00606         and s_vspec_cmatch=s_vspec_cmatch,s_wmask
00607 (p_end) br.cond.dptk.many .terminate
00608         ;;
00609 }{.mmi  // Cycle 1
00610         ld1 s_vmatch0=[s_amatch],1              // load match[0]
00611         // compute prev[cur_match]:
00612         shladd s_vspec_cmatch=s_vspec_cmatch,1,s_aprev
00613         cmp.eq.unc p_end,p0=s_vchainlen,r0
00614 } {.mib
00615         nop.m 0
00616         add s_amatblen=s_awinbest,s_vcurmatch   // match=window + cur_match
00617 (p_end) br.cond.dptk.many .terminate
00618         ;;
00619 }{.mmi  // Cycle 2 (short)
00620         ld2.nt1 s_vspec_cmatch=[s_vspec_cmatch]         // get next cur_match
00621         ;;
00622 }{.mmi  // Cycle 3 (short)
00623         ld1.nt1 s_vmatbst=[s_amatblen],-1       // load match[best_len]
00624         cmp.ne.unc p_ll,p0=r0,r0     // parallel compare initialized as 'false'
00625         ;;
00626 }{.mmi  // Cycle 4 (short)
00627         // load match[1] - - note: match += 3 (in total):
00628         ld1.nt1 s_vmatch1=[s_amatch],2
00629         ;;
00630         // Cycle 5  (short)
00631 (p_bn2) ld1.nt1 s_vmatbst1=[s_amatblen]         // load match[best_len-1]
00632 }{.mib  // Here we (MOST LIKELY) pay a L2-fetch stall
00633         // p_ll switched on as soon as we get a mismatch:
00634         cmp.ne.or p_ll,p0=s_vmatch0,s_vscan0
00635         cmp.ne.or p_ll,p0=s_vmatbst,s_vscanend
00636 (p_ll)  br.cond.dptk.many .next_iter
00637         ;;
00638 }{.mmi  // Cycle 6
00639         ld1.nt1 s_vmatch3=[s_amatch]
00640         mova s_vlen=3
00641         nop.i 0
00642 }{.mib
00643         cmp.ne.or p_ll,p0=s_vmatch1,s_vscan1
00644 (p_bn2) cmp.ne.or p_ll,p0=s_vmatbst1,s_vscanend1
00645 (p_ll)  br.cond.dptk.many .next_iter
00646         ;;
00647 }
00648 
00649 // We have passed the first hurdle - Are there additional matches ???
00650 
00651 .test_more:
00652 {.mmi   // Cycle 0
00653         and s_tm3=7,s_ascan                     // get byte offset
00654         and s_tm4=7,s_amatch                    // get byte offset
00655         movi0 ar.ec=MLAT+SHLAT+2                // NB: One trip more than usual
00656 }{.mib
00657         cmp.ne.unc p_no,p0=s_vscan3,s_vmatch3   // does not next one differ?
00658 (p_no)  br.cond.dptk.many .only3
00659         ;;
00660 }{.mmi  // Cycle 1
00661         and s_tm1=-8,s_ascan    // get aligned address
00662         shladd s_tm3=s_tm3,3,r0
00663         movi0 ar.lc=31          // 32 times around the loop (8B at a time)
00664 }{.mib
00665         and s_tm2=-8,s_amatch                   // get aligned address
00666         shladd s_tm4=s_tm4,3,r0
00667         nop.b 0
00668         ;;
00669 }{.mmi  // Cycle 2
00670         ld8.nt1 scan[1]=[s_tm1],8                       // load first chunk
00671         sub s_tm5=64,s_tm3                              // 64 - amount
00672         movi0 pr.rot=1<<16
00673 }{.mmi
00674         ld8.nt1 match[1]=[s_tm2],8      // load first chunk
00675         sub s_tm6=64,s_tm4              // 64 - amount
00676         add s_vlen=-8,s_vlen            // will be updated at least once
00677         ;;
00678 }
00679         .align 32
00680 .cmploop:
00681 {.mmi   // Cycle 0
00682 (lc[0])                 ld8 scan[0]=[s_tm1],8           // next scan chunk
00683 (lc[MLAT+SHLAT+1])      add s_vlen=8,s_vlen
00684 (lc[MLAT])              first shscan0[0]=scan[MLAT+1],s_tm3
00685 }{.mib
00686 (lc[MLAT+SHLAT+1])      cmp.ne.unc p_no,p0=s_tm7,s_tm8  // break search if !=
00687 (lc[MLAT])              first shmatch0[0]=match[MLAT+1],s_tm4
00688 (p_no)                  br.cond.dpnt.many .mismatch
00689                         ;;
00690 }{.mii  // Cycle 1
00691 (lc[0])                 ld8 match[0]=[s_tm2],8
00692                         // shift left(le) or right(be):
00693 (lc[MLAT])              second shscan1[0]=scan[MLAT],s_tm5
00694 (lc[MLAT])              second shmatch1[0]=match[MLAT],s_tm6
00695 }{.mmb
00696 (lc[MLAT+SHLAT])        or s_tm7=shscan0[SHLAT],shscan1[SHLAT]
00697 (lc[MLAT+SHLAT])        or s_tm8=shmatch0[SHLAT],shmatch1[SHLAT]
00698                         br.ctop.dptk.many .cmploop
00699                         ;;
00700 }{.mfi
00701         mov s_vlen=258
00702         nop.f 0
00703 }{.mfi
00704         nop.f 0    // realign
00705         ;;
00706 }
00707 .mismatch:
00708 {.mii   // Cycle 0 (short)
00709 (p_no)  pcmp1.eq s_tm2=s_tm7,s_tm8      // find first non-matching character
00710         nop.i 0
00711         ;;
00712         // Cycle 1 (short)
00713 (p_no)  count s_tm1=s_tm2
00714         ;;
00715 }{.mib  // Cycle 2 (short)
00716 (p_no)  add s_vlen=s_vlen,s_tm1         // effective length
00717         nop.i 0
00718         clrrrb
00719         ;;
00720 }
00721 
00722 .only3:
00723 {.mib   // Cycle 0 (short)
00724         cmp.gt.unc p0,p_nbs=s_vlen,s_vbestlen           // (len > best_len) ?
00725 (p_nbs) br.cond.dpnt.many .next_iter                    // if not, re-iternate
00726         ;;
00727 }{.mmi  // Cycle 1 (short)
00728         ld4 s_tm7=[s_anicematch]                        // nice_match
00729         st4 [s_amatchstart]= s_vcurmatch
00730         add s_ascanend=s_ascan,s_vlen                   // reset with best_len
00731         ;;
00732 }{.mmi  // Cycle 2 (short)
00733         mova s_vbestlen=s_vlen
00734         add s_ascanend=-3,s_ascanend            // remember extra offset
00735         ;;
00736 }{.mmi  // Cycle 3 (short)
00737         ld1 s_vscanend=[s_ascanend],-1          // scan_end=scan[best_len]
00738         add s_awinbest=s_awindow,s_vbestlen     // update with new best_len
00739         cmp.ne.unc p_bn2,p0=2,s_vbestlen        // set if bestlen != 2
00740         ;;
00741 }{.mib  // Cycle 4 (short)
00742         // scan_end1=scan[best_len-1] NB: s_ascanend reset:
00743         ld1.nt1 s_vscanend1=[s_ascanend],1
00744         cmp.lt.unc p_nnc,p0=s_vlen,s_tm7        // compare with nice_match
00745 (p_nnc) br.cond.dptk.many .next_iter
00746         ;;
00747 }
00748 .terminate:
00749 {.mii   // Cycle 0/1
00750         nop.m 0
00751         movi0 ar.lc=s_lcsave
00752         movi0 pr=s_prsave,-1
00753 }{.mbb
00754         nop.m 0
00755         nop.b 0
00756         br.ret.sptk.many rp     // ret0 is identical to best_len
00757         ;;
00758 }
00759         .endp
00760 
00761         .global match_init
00762         .proc match_init
00763 match_init:
00764         sub ret0=ret0,ret0
00765         br.ret.sptk.many rp
00766         .endp
00767 
00768 # else
00769  error: this asm version is for 386 or 680x0 or ia64 only
00770 # endif /* __ia64__ */
00771 #endif /* mc68000 || mc68020 */
00772 #endif /* i386 || _I386   */

© sourcejam.com 2005-2008