diff --git a/go.mod b/go.mod index eb92a1b0..3f507ad5 100644 --- a/go.mod +++ b/go.mod @@ -4,7 +4,7 @@ go 1.16 require ( github.com/golang/protobuf v1.4.2 // indirect - github.com/klauspost/compress v1.11.12 + github.com/klauspost/compress v1.13.4 github.com/minio/highwayhash v1.0.1 github.com/nats-io/jwt/v2 v2.0.3 github.com/nats-io/nats.go v1.11.1-0.20210819195927-9053aa4200f0 diff --git a/go.sum b/go.sum index ca43c6c6..65f83fd4 100644 --- a/go.sum +++ b/go.sum @@ -5,11 +5,14 @@ github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:W github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0= github.com/golang/protobuf v1.4.2 h1:+Z5KGCizgyZCbGh1KZqA0fcLLkwbsjIzS4aV2v7wJX0= github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= +github.com/golang/snappy v0.0.3/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/klauspost/compress v1.11.12 h1:famVnQVu7QwryBN4jNseQdUKES71ZAOnB6UQQJPZvqk= github.com/klauspost/compress v1.11.12/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs= +github.com/klauspost/compress v1.13.4 h1:0zhec2I8zGnjWcKyLl6i3gPqKANCCn5e9xmviEEeX6s= +github.com/klauspost/compress v1.13.4/go.mod h1:8dP1Hq4DHOhN9w426knH3Rhby4rFm6D8eO+e+Dq5Gzg= github.com/minio/highwayhash v1.0.1 h1:dZ6IIu8Z14VlC0VpfKofAhCy74wu/Qb5gcn52yWoz/0= github.com/minio/highwayhash v1.0.1/go.mod h1:BQskDq+xkJ12lmlUUi7U0M5Swg3EWR+dLTk+kldvVxY= github.com/nats-io/jwt v1.2.2 h1:w3GMTO969dFg+UOKTmmyuu7IGdusK+7Ytlt//OYH/uU= diff --git a/server/events.go b/server/events.go index fe082bc8..54844ae6 100644 --- a/server/events.go +++ b/server/events.go @@ -15,6 +15,7 @@ package server import ( "bytes" + "compress/gzip" "encoding/json" "errors" "fmt" @@ -26,6 +27,7 @@ import ( "sync/atomic" "time" + "github.com/klauspost/compress/s2" "github.com/nats-io/jwt/v2" "github.com/nats-io/nats-server/v2/server/pse" ) @@ -238,6 +240,7 @@ type pubMsg struct { rply string si *ServerInfo msg interface{} + oct compressionType last bool } @@ -331,16 +334,44 @@ RESET: c.mu.Lock() // Prep internal structures needed to send message. - c.pa.subject = []byte(pm.sub) - c.pa.size = len(b) - c.pa.szb = []byte(strconv.FormatInt(int64(len(b)), 10)) - c.pa.reply = []byte(pm.rply) + c.pa.subject, c.pa.reply = []byte(pm.sub), []byte(pm.rply) + c.pa.size, c.pa.szb = len(b), []byte(strconv.FormatInt(int64(len(b)), 10)) + c.pa.hdr, c.pa.hdb = -1, nil trace := c.trace + + // Now check for optional compression. + var contentHeader string + var bb bytes.Buffer + + if len(b) > 0 { + switch pm.oct { + case gzipCompression: + zw := gzip.NewWriter(&bb) + zw.Write(b) + zw.Close() + b = bb.Bytes() + contentHeader = "gzip" + case snappyCompression: + sw := s2.NewWriter(&bb, s2.WriterSnappyCompat()) + sw.Write(b) + sw.Close() + b = bb.Bytes() + contentHeader = "snappy" + case unsupportedCompression: + b = c.setHeader(contentEncodingHeader, "identity", b) + contentHeader = "identity" + } + } c.mu.Unlock() // Add in NL b = append(b, _CRLF_...) + // Check if we should set content-encoding + if contentHeader != _EMPTY_ { + b = c.setHeader(contentEncodingHeader, contentHeader, b) + } + if trace { c.traceInOp(fmt.Sprintf("PUB %s %s %d", c.pa.subject, c.pa.reply, c.pa.size), nil) c.traceMsg(b) @@ -383,7 +414,7 @@ func (s *Server) sendShutdownEvent() { s.mu.Unlock() // Send to the internal queue and mark as last. si := &ServerInfo{} - sendq <- &pubMsg{nil, subj, _EMPTY_, si, si, true} + sendq <- &pubMsg{nil, subj, _EMPTY_, si, si, noCompression, true} } // Used to send an internal message to an arbitrary account. @@ -405,33 +436,46 @@ func (s *Server) sendInternalAccountMsg(a *Account, subject string, msg interfac a.mu.Unlock() } - sendq <- &pubMsg{c, subject, _EMPTY_, nil, msg, false} + sendq <- &pubMsg{c, subject, _EMPTY_, nil, msg, noCompression, false} return nil } // This will queue up a message to be sent. // Lock should not be held. -func (s *Server) sendInternalMsgLocked(sub, rply string, si *ServerInfo, msg interface{}) { +func (s *Server) sendInternalMsgLocked(subj, rply string, si *ServerInfo, msg interface{}) { s.mu.Lock() - s.sendInternalMsg(sub, rply, si, msg) + s.sendInternalMsg(subj, rply, si, msg) s.mu.Unlock() } // This will queue up a message to be sent. // Assumes lock is held on entry. -func (s *Server) sendInternalMsg(sub, rply string, si *ServerInfo, msg interface{}) { +func (s *Server) sendInternalMsg(subj, rply string, si *ServerInfo, msg interface{}) { if s.sys == nil || s.sys.sendq == nil { return } sendq := s.sys.sendq // Don't hold lock while placing on the channel. s.mu.Unlock() - sendq <- &pubMsg{nil, sub, rply, si, msg, false} + sendq <- &pubMsg{nil, subj, rply, si, msg, noCompression, false} s.mu.Lock() } +// Will send an api response. +func (s *Server) sendInternalResponse(subj string, response *ServerAPIResponse) { + s.mu.Lock() + if s.sys == nil || s.sys.sendq == nil { + s.mu.Unlock() + return + } + sendq := s.sys.sendq + // Don't hold lock while placing on the channel. + s.mu.Unlock() + sendq <- &pubMsg{nil, subj, _EMPTY_, response.Server, response, response.compress, false} +} + // Used to send internal messages from other system clients to avoid no echo issues. -func (c *client) sendInternalMsg(sub, rply string, si *ServerInfo, msg interface{}) { +func (c *client) sendInternalMsg(subj, rply string, si *ServerInfo, msg interface{}) { if c == nil { return } @@ -447,7 +491,7 @@ func (c *client) sendInternalMsg(sub, rply string, si *ServerInfo, msg interface // Don't hold lock while placing on the channel. s.mu.Unlock() - sendq <- &pubMsg{c, sub, rply, si, msg, false} + sendq <- &pubMsg{c, subj, rply, si, msg, noCompression, false} } // Locked version of checking if events system running. Also checks server. @@ -1266,6 +1310,35 @@ func (s *Server) filterRequest(fOpts *EventFilterOptions) bool { return false } +// Encoding support (compression) +type compressionType int8 + +const ( + noCompression = compressionType(iota) + gzipCompression + snappyCompression + unsupportedCompression +) + +// ServerAPIResponse is the response type for the server API like varz, connz etc. +type ServerAPIResponse struct { + Server *ServerInfo `json:"server"` + Data interface{} `json:"data,omitempty"` + Error *ApiError `json:"error,omitempty"` + + // Private to indicate compression if any. + compress compressionType +} + +// Specialized response types for unmarshalling. + +// ServerAPIConnzResponse is the response type connz +type ServerAPIConnzResponse struct { + Server *ServerInfo `json:"server"` + Data *Connz `json:"data,omitempty"` + Error *ApiError `json:"error,omitempty"` +} + // statszReq is a request for us to respond with current statsz. func (s *Server) statszReq(sub *subscription, _ *client, _ *Account, subject, reply string, msg []byte) { if !s.EventsEnabled() || reply == _EMPTY_ { @@ -1274,13 +1347,11 @@ func (s *Server) statszReq(sub *subscription, _ *client, _ *Account, subject, re opts := StatszEventOptions{} if len(msg) != 0 { if err := json.Unmarshal(msg, &opts); err != nil { - server := &ServerInfo{} - response := map[string]interface{}{"server": server} - response["error"] = map[string]interface{}{ - "code": http.StatusBadRequest, - "description": err.Error(), + response := &ServerAPIResponse{ + Server: &ServerInfo{}, + Error: &ApiError{Code: http.StatusBadRequest, Description: err.Error()}, } - s.sendInternalMsgLocked(reply, _EMPTY_, server, response) + s.sendInternalMsgLocked(reply, _EMPTY_, response.Server, response) return } else if ignore := s.filterRequest(&opts.EventFilterOptions); ignore { return @@ -1293,15 +1364,34 @@ func (s *Server) statszReq(sub *subscription, _ *client, _ *Account, subject, re var errSkipZreq = errors.New("filtered response") +const ( + acceptEncodingHeader = "Accept-Encoding" + contentEncodingHeader = "Content-Encoding" +) + +// This is not as formal as it could be. We see if anything has s2 or snappy first, then gzip. +func getAcceptEncoding(hdr []byte) compressionType { + ae := strings.ToLower(string(getHeader(acceptEncodingHeader, hdr))) + if ae == _EMPTY_ { + return noCompression + } + if strings.Contains(ae, "snappy") || strings.Contains(ae, "s2") { + return snappyCompression + } + if strings.Contains(ae, "gzip") { + return gzipCompression + } + return unsupportedCompression +} + func (s *Server) zReq(c *client, reply string, rmsg []byte, fOpts *EventFilterOptions, optz interface{}, respf func() (interface{}, error)) { if !s.EventsEnabled() || reply == _EMPTY_ { return } - server := &ServerInfo{} - response := map[string]interface{}{"server": server} + response := &ServerAPIResponse{Server: &ServerInfo{}} var err error status := 0 - _, msg := c.msgParts(rmsg) + hdr, msg := c.msgParts(rmsg) if len(msg) != 0 { if err = json.Unmarshal(msg, optz); err != nil { status = http.StatusBadRequest // status is only included on error, so record how far execution got @@ -1310,19 +1400,19 @@ func (s *Server) zReq(c *client, reply string, rmsg []byte, fOpts *EventFilterOp } } if err == nil { - response["data"], err = respf() + response.Data, err = respf() if errors.Is(err, errSkipZreq) { return + } else if err != nil { + status = http.StatusInternalServerError } - status = http.StatusInternalServerError } if err != nil { - response["error"] = map[string]interface{}{ - "code": status, - "description": err.Error(), - } + response.Error = &ApiError{Code: status, Description: err.Error()} + } else if len(hdr) > 0 { + response.compress = getAcceptEncoding(hdr) } - s.sendInternalMsgLocked(reply, _EMPTY_, server, response) + s.sendInternalResponse(reply, response) } // remoteConnsUpdate gets called when we receive a remote update from another server. @@ -1472,7 +1562,7 @@ func (s *Server) sendAccConnsUpdate(a *Account, subj ...string) { } } for _, sub := range subj { - msg := &pubMsg{nil, sub, _EMPTY_, &m.Server, &m, false} + msg := &pubMsg{nil, sub, _EMPTY_, &m.Server, &m, noCompression, false} select { case sendQ <- msg: default: diff --git a/server/jetstream.go b/server/jetstream.go index 6e47fe6e..ede44de0 100644 --- a/server/jetstream.go +++ b/server/jetstream.go @@ -1497,7 +1497,7 @@ func (jsa *jsAccount) sendClusterUsageUpdate() { le.PutUint64(b[16:], uint64(jsa.usage.api)) le.PutUint64(b[24:], uint64(jsa.usage.err)) if jsa.sendq != nil { - jsa.sendq <- &pubMsg{nil, jsa.updatesPub, _EMPTY_, nil, b, false} + jsa.sendq <- &pubMsg{nil, jsa.updatesPub, _EMPTY_, nil, b, noCompression, false} } } diff --git a/server/norace_test.go b/server/norace_test.go index 4f6aa304..0295f013 100644 --- a/server/norace_test.go +++ b/server/norace_test.go @@ -17,8 +17,12 @@ package server import ( "bufio" + "bytes" + "compress/gzip" "encoding/json" "fmt" + "io" + "io/ioutil" "math/rand" "net" "net/url" @@ -30,6 +34,7 @@ import ( "testing" "time" + "github.com/klauspost/compress/s2" "github.com/nats-io/jwt/v2" "github.com/nats-io/nats.go" "github.com/nats-io/nkeys" @@ -2682,6 +2687,65 @@ func TestNoRaceAccountConnz(t *testing.T) { doFiltered("two", "foo.>", 20) } +func TestNoRaceCompressedConnz(t *testing.T) { + s := RunBasicJetStreamServer() + defer s.Shutdown() + + config := s.JetStreamConfig() + if config != nil { + defer removeDir(t, config.StoreDir) + } + + nc, _ := jsClientConnect(t, s) + defer nc.Close() + + doRequest := func(compress string) { + t.Helper() + m := nats.NewMsg("$SYS.REQ.ACCOUNT.PING.CONNZ") + m.Header.Add("Accept-Encoding", compress) + resp, err := nc.RequestMsg(m, time.Second) + if err != nil { + t.Fatalf("Unexpected error: %v", err) + } + buf := resp.Data + + // Make sure we have an encoding header. + ce := resp.Header.Get("Content-Encoding") + switch strings.ToLower(ce) { + case "gzip": + zr, err := gzip.NewReader(bytes.NewReader(buf)) + if err != nil { + t.Fatalf("Unexpected error: %v", err) + } + defer zr.Close() + buf, err = ioutil.ReadAll(zr) + if err != nil && err != io.ErrUnexpectedEOF { + t.Fatalf("Unexpected error: %v", err) + } + case "snappy", "s2": + sr := s2.NewReader(bytes.NewReader(buf)) + buf, err = ioutil.ReadAll(sr) + if err != nil && err != io.ErrUnexpectedEOF { + t.Fatalf("Unexpected error: %v", err) + } + default: + t.Fatalf("Unknown content-encoding of %q", ce) + } + + var cz ServerAPIConnzResponse + if err := json.Unmarshal(buf, &cz); err != nil { + t.Fatalf("Unexpected error: %v", err) + } + if cz.Error != nil { + t.Fatalf("Unexpected error: %+v", cz.Error) + } + } + + doRequest("gzip") + doRequest("snappy") + doRequest("s2") +} + func TestNoRaceJetStreamClusterExtendedStreamPurge(t *testing.T) { for _, st := range []StorageType{FileStorage, MemoryStorage} { t.Run(st.String(), func(t *testing.T) { diff --git a/vendor/github.com/klauspost/compress/s2/README.md b/vendor/github.com/klauspost/compress/s2/README.md index fc74706b..81fad652 100644 --- a/vendor/github.com/klauspost/compress/s2/README.md +++ b/vendor/github.com/klauspost/compress/s2/README.md @@ -7,26 +7,30 @@ S2 is aimed for high throughput, which is why it features concurrent compression Decoding is compatible with Snappy compressed content, but content compressed with S2 cannot be decompressed by Snappy. This means that S2 can seamlessly replace Snappy without converting compressed content. +S2 can produce Snappy compatible output, faster and better than Snappy. +If you want full benefit of the changes you should use s2 without Snappy compatibility. + S2 is designed to have high throughput on content that cannot be compressed. This is important, so you don't have to worry about spending CPU cycles on already compressed data. ## Benefits over Snappy * Better compression +* Adjustable compression (3 levels) * Concurrent stream compression -* Faster decompression +* Faster decompression, even for Snappy compatible content * Ability to quickly skip forward in compressed stream * Compatible with reading Snappy compressed content -* Offers alternative, more efficient, but slightly slower compression mode. -* Smaller block size overhead on incompressible blocks. +* Smaller block size overhead on incompressible blocks * Block concatenation -* Automatic stream size padding. -* Snappy compatible block compression. +* Uncompressed stream mode +* Automatic stream size padding +* Snappy compatible block compression ## Drawbacks over Snappy * Not optimized for 32 bit systems. -* Uses slightly more memory (4MB per core) due to larger blocks and concurrency (configurable). +* Streams use slightly more memory due to larger blocks and concurrency (configurable). # Usage @@ -150,36 +154,43 @@ To build binaries to the current folder use: Usage: s2c [options] file1 file2 Compresses all files supplied as input separately. -Output files are written as 'filename.ext.s2'. +Output files are written as 'filename.ext.s2' or 'filename.ext.snappy'. By default output files will be overwritten. Use - as the only file name to read from stdin and write to stdout. Wildcards are accepted: testdir/*.txt will compress all files in testdir ending with .txt Directories can be wildcards as well. testdir/*/*.txt will match testdir/subdir/b.txt +File names beginning with 'http://' and 'https://' will be downloaded and compressed. +Only http response code 200 is accepted. + Options: -bench int - Run benchmark n times. No output will be written + Run benchmark n times. No output will be written -blocksize string - Max block size. Examples: 64K, 256K, 1M, 4M. Must be power of two and <= 4MB (default "4M") - -c Write all output to stdout. Multiple input files will be concatenated + Max block size. Examples: 64K, 256K, 1M, 4M. Must be power of two and <= 4MB (default "4M") + -c Write all output to stdout. Multiple input files will be concatenated -cpu int - Compress using this amount of threads (default 32) + Compress using this amount of threads (default 32) -faster - Compress faster, but with a minor compression loss + Compress faster, but with a minor compression loss -help - Display help + Display help + -o string + Write output to another file. Single input file only -pad string - Pad size to a multiple of this value, Examples: 500, 64K, 256K, 1M, 4M, etc (default "1") - -q Don't write any output to terminal, except errors + Pad size to a multiple of this value, Examples: 500, 64K, 256K, 1M, 4M, etc (default "1") + -q Don't write any output to terminal, except errors -rm - Delete source file(s) after successful compression + Delete source file(s) after successful compression -safe - Do not overwrite output files + Do not overwrite output files -slower - Compress more, but a lot slower + Compress more, but a lot slower + -snappy + Generate Snappy compatible output stream -verify - Verify written files + Verify written files ``` @@ -195,19 +206,24 @@ Use - as the only file name to read from stdin and write to stdout. Wildcards are accepted: testdir/*.txt will compress all files in testdir ending with .txt Directories can be wildcards as well. testdir/*/*.txt will match testdir/subdir/b.txt +File names beginning with 'http://' and 'https://' will be downloaded and decompressed. +Extensions on downloaded files are ignored. Only http response code 200 is accepted. + Options: -bench int - Run benchmark n times. No output will be written - -c Write all output to stdout. Multiple input files will be concatenated + Run benchmark n times. No output will be written + -c Write all output to stdout. Multiple input files will be concatenated -help - Display help - -q Don't write any output to terminal, except errors + Display help + -o string + Write output to another file. Single input file only + -q Don't write any output to terminal, except errors -rm - Delete source file(s) after successful decompression + Delete source file(s) after successful decompression -safe - Do not overwrite output files + Do not overwrite output files -verify - Verify files, but do not write output + Verify files, but do not write output ``` ## s2sx: self-extracting archives @@ -224,7 +240,8 @@ Usage: s2sx [options] file1 file2 Compresses all files supplied as input separately. If files have '.s2' extension they are assumed to be compressed already. -Output files are written as 'filename.s2sfx' and with '.exe' for windows targets. +Output files are written as 'filename.s2sx' and with '.exe' for windows targets. +If output is big, an additional file with ".more" is written. This must be included as well. By default output files will be overwritten. Wildcards are accepted: testdir/*.txt will compress all files in testdir ending with .txt @@ -238,6 +255,8 @@ Options: Compress using this amount of threads (default 32) -help Display help + -max string + Maximum executable size. Rest will be written to another file. (default "1G") -os string Destination operating system (default "windows") -q Don't write any output to terminal, except errors @@ -261,6 +280,17 @@ Available platforms are: * windows-386 * windows-amd64 +By default, there is a size limit of 1GB for the output executable. + +When this is exceeded the remaining file content is written to a file called +output+`.more`. This file must be included for a successful extraction and +placed alongside the executable for a successful extraction. + +This file *must* have the same name as the executable, so if the executable is renamed, +so must the `.more` file. + +This functionality is disabled with stdin/stdout. + ### Self-extracting TAR files If you wrap a TAR file you can specify `-untar` to make it untar on the destination host. @@ -390,21 +420,34 @@ In rare, worst case scenario Snappy blocks could be significantly bigger than th ### Mixed content blocks The most reliable is a wide dataset. -For this we use `webdevdata.org-2015-01-07-subset`, 53927 files, total input size: 4,014,526,923 bytes. -Single goroutine used. +For this we use [`webdevdata.org-2015-01-07-subset`](https://files.klauspost.com/compress/webdevdata.org-2015-01-07-4GB-subset.7z), +53927 files, total input size: 4,014,735,833 bytes. Single goroutine used. | * | Input | Output | Reduction | MB/s | |-------------------|------------|------------|-----------|--------| -| S2 | 4014526923 | 1062282489 | 73.54% | **861.44** | -| S2 Better | 4014526923 | 981221284 | **75.56%** | 399.54 | -| Snappy | 4014526923 | 1128667736 | 71.89% | 741.29 | -| S2, Snappy Output | 4014526923 | 1093784815 | 72.75% | 843.66 | +| S2 | 4014735833 | 1059723369 | 73.60% | **934.34** | +| S2 Better | 4014735833 | 969670507 | 75.85% | 532.70 | +| S2 Best | 4014735833 | 906625668 | **77.85%** | 46.84 | +| Snappy | 4014735833 | 1128706759 | 71.89% | 762.59 | +| S2, Snappy Output | 4014735833 | 1093821420 | 72.75% | 908.60 | +| LZ4 | 4014735833 | 1079259294 | 73.12% | 526.94 | -S2 delivers both the best single threaded throuhput with regular mode and the best compression rate with "better" mode. +S2 delivers both the best single threaded throughput with regular mode and the best compression rate with "best". +"Better" mode provides the same compression speed as LZ4 with better compression ratio. -When outputting Snappy compatible output it still delivers better throughput (100MB/s more) and better compression. +When outputting Snappy compatible output it still delivers better throughput (150MB/s more) and better compression. -As can be seen from the other benchmarks decompression should also be easier on the S2 generated output. +As can be seen from the other benchmarks decompression should also be easier on the S2 generated output. + +Though they cannot be compared due to different decompression speeds here are the speed/size comparisons for +other Go compressors: + +| * | Input | Output | Reduction | MB/s | +|-------------------|------------|------------|-----------|--------| +| Zstd Fastest (Go) | 4014735833 | 794608518 | 80.21% | 236.04 | +| Zstd Best (Go) | 4014735833 | 704603356 | 82.45% | 35.63 | +| Deflate (Go) l1 | 4014735833 | 871294239 | 78.30% | 214.04 | +| Deflate (Go) l9 | 4014735833 | 730389060 | 81.81% | 41.17 | ### Standard block compression @@ -537,31 +580,97 @@ Some examples compared on 16 core CPU, amd64 assembly used: * enwik10 Default... 10000000000 -> 4761467548 [47.61%]; 1.098s, 8685.6MB/s Better... 10000000000 -> 4219438251 [42.19%]; 1.925s, 4954.2MB/s -Best... 10000000000 -> 3667646858 [36.68%]; 35.995s, 264.9MB/s +Best... 10000000000 -> 3627364337 [36.27%]; 43.051s, 221.5MB/s * github-june-2days-2019.json Default... 6273951764 -> 1043196283 [16.63%]; 431ms, 13882.3MB/s Better... 6273951764 -> 949146808 [15.13%]; 547ms, 10938.4MB/s -Best... 6273951764 -> 846260870 [13.49%]; 8.125s, 736.4MB/s +Best... 6273951764 -> 832855506 [13.27%]; 9.455s, 632.8MB/s * nyc-taxi-data-10M.csv Default... 3325605752 -> 1095998837 [32.96%]; 324ms, 9788.7MB/s Better... 3325605752 -> 954776589 [28.71%]; 491ms, 6459.4MB/s -Best... 3325605752 -> 794873295 [23.90%]; 6.619s, 479.1MB/s +Best... 3325605752 -> 779098746 [23.43%]; 8.29s, 382.6MB/s * 10gb.tar Default... 10065157632 -> 5916578242 [58.78%]; 1.028s, 9337.4MB/s Better... 10065157632 -> 5649207485 [56.13%]; 1.597s, 6010.6MB/s -Best... 10065157632 -> 5246578570 [52.13%]; 25.696s, 373.6MB/s +Best... 10065157632 -> 5208719802 [51.75%]; 32.78s, 292.8MB/ * consensus.db.10gb Default... 10737418240 -> 4562648848 [42.49%]; 882ms, 11610.0MB/s Better... 10737418240 -> 4542428129 [42.30%]; 1.533s, 6679.7MB/s -Best... 10737418240 -> 4272335558 [39.79%]; 38.955s, 262.9MB/s +Best... 10737418240 -> 4244773384 [39.53%]; 42.96s, 238.4MB/s ``` Decompression speed should be around the same as using the 'better' compression mode. +# Snappy Compatibility + +S2 now offers full compatibility with Snappy. + +This means that the efficient encoders of S2 can be used to generate fully Snappy compatible output. + +There is a [snappy](https://github.com/klauspost/compress/tree/master/snappy) package that can be used by +simply changing imports from `github.com/golang/snappy` to `github.com/klauspost/compress/snappy`. +This uses "better" mode for all operations. +If you would like more control, you can use the s2 package as described below: + +## Blocks + +Snappy compatible blocks can be generated with the S2 encoder. +Compression and speed is typically a bit better `MaxEncodedLen` is also smaller for smaller memory usage. Replace + +| Snappy | S2 replacement | +|----------------------------|-------------------------| +| snappy.Encode(...) | s2.EncodeSnappy(...) | +| snappy.MaxEncodedLen(...) | s2.MaxEncodedLen(...) | + +`s2.EncodeSnappy` can be replaced with `s2.EncodeSnappyBetter` or `s2.EncodeSnappyBest` to get more efficiently compressed snappy compatible output. + +`s2.ConcatBlocks` is compatible with snappy blocks. + +Comparison of [`webdevdata.org-2015-01-07-subset`](https://files.klauspost.com/compress/webdevdata.org-2015-01-07-4GB-subset.7z), +53927 files, total input size: 4,014,735,833 bytes. amd64, single goroutine used: + +| Encoder | Size | MB/s | Reduction | +|-----------------------|------------|--------|------------ +| snappy.Encode | 1128706759 | 725.59 | 71.89% | +| s2.EncodeSnappy | 1093823291 | 899.16 | 72.75% | +| s2.EncodeSnappyBetter | 1001158548 | 578.49 | 75.06% | +| s2.EncodeSnappyBest | 944507998 | 66.00 | 76.47% | + +## Streams + +For streams, replace `enc = snappy.NewBufferedWriter(w)` with `enc = s2.NewWriter(w, s2.WriterSnappyCompat())`. +All other options are available, but note that block size limit is different for snappy. + +Comparison of different streams, AMD Ryzen 3950x, 16 cores. Size and throughput: + +| File | snappy.NewWriter | S2 Snappy | S2 Snappy, Better | S2 Snappy, Best | +|-----------------------------|--------------------------|---------------------------|--------------------------|-------------------------| +| nyc-taxi-data-10M.csv | 1316042016 - 517.54MB/s | 1307003093 - 8406.29MB/s | 1174534014 - 4984.35MB/s | 1115904679 - 177.81MB/s | +| enwik10 | 5088294643 - 433.45MB/s | 5175840939 - 8454.52MB/s | 4560784526 - 4403.10MB/s | 4340299103 - 159.71MB/s | +| 10gb.tar | 6056946612 - 703.25MB/s | 6208571995 - 9035.75MB/s | 5741646126 - 2402.08MB/s | 5548973895 - 171.17MB/s | +| github-june-2days-2019.json | 1525176492 - 908.11MB/s | 1476519054 - 12625.93MB/s | 1400547532 - 6163.61MB/s | 1321887137 - 200.71MB/s | +| consensus.db.10gb | 5412897703 - 1054.38MB/s | 5354073487 - 12634.82MB/s | 5335069899 - 2472.23MB/s | 5201000954 - 166.32MB/s | + +# Decompression + +All decompression functions map directly to equivalent s2 functions. + +| Snappy | S2 replacement | +|------------------------|--------------------| +| snappy.Decode(...) | s2.Decode(...) | +| snappy.DecodedLen(...) | s2.DecodedLen(...) | +| snappy.NewReader(...) | s2.NewReader(...) | + +Features like [quick forward skipping without decompression](https://pkg.go.dev/github.com/klauspost/compress/s2#Reader.Skip) +are also available for Snappy streams. + +If you know you are only decompressing snappy streams, setting [`ReaderMaxBlockSize(64<<10)`](https://pkg.go.dev/github.com/klauspost/compress/s2#ReaderMaxBlockSize) +on your Reader will reduce memory consumption. + # Concatenating blocks and streams. Concatenating streams will concatenate the output of both without recompressing them. diff --git a/vendor/github.com/klauspost/compress/s2/decode.go b/vendor/github.com/klauspost/compress/s2/decode.go index 8e65a875..d0ae5304 100644 --- a/vendor/github.com/klauspost/compress/s2/decode.go +++ b/vendor/github.com/klauspost/compress/s2/decode.go @@ -144,12 +144,13 @@ type Reader struct { // maximum expected buffer size. maxBufSize int // alloc a buffer this size if > 0. - lazyBuf int - readHeader bool - paramsOK bool + lazyBuf int + readHeader bool + paramsOK bool + snappyFrame bool } -// ensureBufferSize will ensure that the buffe can take at least n bytes. +// ensureBufferSize will ensure that the buffer can take at least n bytes. // If false is returned the buffer exceeds maximum allowed size. func (r *Reader) ensureBufferSize(n int) bool { if len(r.buf) >= n { @@ -252,7 +253,9 @@ func (r *Reader) Read(p []byte) (int, error) { return 0, r.err } if !r.ensureBufferSize(chunkLen) { - r.err = ErrUnsupported + if r.err == nil { + r.err = ErrUnsupported + } return 0, r.err } buf := r.buf[:chunkLen] @@ -267,6 +270,11 @@ func (r *Reader) Read(p []byte) (int, error) { r.err = err return 0, r.err } + if r.snappyFrame && n > maxSnappyBlockSize { + r.err = ErrCorrupt + return 0, r.err + } + if n > len(r.decoded) { if n > r.maxBlock { r.err = ErrCorrupt @@ -292,7 +300,9 @@ func (r *Reader) Read(p []byte) (int, error) { return 0, r.err } if !r.ensureBufferSize(chunkLen) { - r.err = ErrUnsupported + if r.err == nil { + r.err = ErrUnsupported + } return 0, r.err } buf := r.buf[:checksumSize] @@ -302,6 +312,10 @@ func (r *Reader) Read(p []byte) (int, error) { checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 // Read directly into r.decoded instead of via r.buf. n := chunkLen - checksumSize + if r.snappyFrame && n > maxSnappyBlockSize { + r.err = ErrCorrupt + return 0, r.err + } if n > len(r.decoded) { if n > r.maxBlock { r.err = ErrCorrupt @@ -332,7 +346,11 @@ func (r *Reader) Read(p []byte) (int, error) { if string(r.buf[:len(magicBody)]) != magicBodySnappy { r.err = ErrCorrupt return 0, r.err + } else { + r.snappyFrame = true } + } else { + r.snappyFrame = false } continue } @@ -408,7 +426,9 @@ func (r *Reader) Skip(n int64) error { return r.err } if !r.ensureBufferSize(chunkLen) { - r.err = ErrUnsupported + if r.err == nil { + r.err = ErrUnsupported + } return r.err } buf := r.buf[:chunkLen] @@ -454,7 +474,9 @@ func (r *Reader) Skip(n int64) error { return r.err } if !r.ensureBufferSize(chunkLen) { - r.err = ErrUnsupported + if r.err != nil { + r.err = ErrUnsupported + } return r.err } buf := r.buf[:checksumSize] @@ -518,3 +540,26 @@ func (r *Reader) Skip(n int64) error { } return nil } + +// ReadByte satisfies the io.ByteReader interface. +func (r *Reader) ReadByte() (byte, error) { + if r.err != nil { + return 0, r.err + } + if r.i < r.j { + c := r.decoded[r.i] + r.i++ + return c, nil + } + var tmp [1]byte + for i := 0; i < 10; i++ { + n, err := r.Read(tmp[:]) + if err != nil { + return 0, err + } + if n == 1 { + return tmp[0], nil + } + } + return 0, io.ErrNoProgress +} diff --git a/vendor/github.com/klauspost/compress/s2/decode_other.go b/vendor/github.com/klauspost/compress/s2/decode_other.go index aac33642..5da4a447 100644 --- a/vendor/github.com/klauspost/compress/s2/decode_other.go +++ b/vendor/github.com/klauspost/compress/s2/decode_other.go @@ -7,7 +7,10 @@ package s2 -import "fmt" +import ( + "fmt" + "strconv" +) // decode writes the decoding of src to dst. It assumes that the varint-encoded // length of the decompressed bytes has already been read, and that len(dst) @@ -44,7 +47,7 @@ func s2Decode(dst, src []byte) int { x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24 } length = int(x) + 1 - if length > len(dst)-d || length > len(src)-s { + if length > len(dst)-d || length > len(src)-s || (strconv.IntSize == 32 && length <= 0) { return decodeErrCodeCorrupt } if debug { @@ -158,7 +161,7 @@ func s2Decode(dst, src []byte) int { x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24 } length = int(x) + 1 - if length > len(dst)-d || length > len(src)-s { + if length > len(dst)-d || length > len(src)-s || (strconv.IntSize == 32 && length <= 0) { return decodeErrCodeCorrupt } if debug { diff --git a/vendor/github.com/klauspost/compress/s2/encode.go b/vendor/github.com/klauspost/compress/s2/encode.go index 8f89e21a..aa8b108d 100644 --- a/vendor/github.com/klauspost/compress/s2/encode.go +++ b/vendor/github.com/klauspost/compress/s2/encode.go @@ -186,6 +186,94 @@ func EncodeSnappy(dst, src []byte) []byte { return dst[:d] } +// EncodeSnappyBetter returns the encoded form of src. The returned slice may be a sub- +// slice of dst if dst was large enough to hold the entire encoded block. +// Otherwise, a newly allocated slice will be returned. +// +// The output is Snappy compatible and will likely decompress faster. +// +// The dst and src must not overlap. It is valid to pass a nil dst. +// +// The blocks will require the same amount of memory to decode as encoding, +// and does not make for concurrent decoding. +// Also note that blocks do not contain CRC information, so corruption may be undetected. +// +// If you need to encode larger amounts of data, consider using +// the streaming interface which gives all of these features. +func EncodeSnappyBetter(dst, src []byte) []byte { + if n := MaxEncodedLen(len(src)); n < 0 { + panic(ErrTooLarge) + } else if cap(dst) < n { + dst = make([]byte, n) + } else { + dst = dst[:n] + } + + // The block starts with the varint-encoded length of the decompressed bytes. + d := binary.PutUvarint(dst, uint64(len(src))) + + if len(src) == 0 { + return dst[:d] + } + if len(src) < minNonLiteralBlockSize { + d += emitLiteral(dst[d:], src) + return dst[:d] + } + + n := encodeBlockBetterSnappy(dst[d:], src) + if n > 0 { + d += n + return dst[:d] + } + // Not compressible + d += emitLiteral(dst[d:], src) + return dst[:d] +} + +// EncodeSnappyBest returns the encoded form of src. The returned slice may be a sub- +// slice of dst if dst was large enough to hold the entire encoded block. +// Otherwise, a newly allocated slice will be returned. +// +// The output is Snappy compatible and will likely decompress faster. +// +// The dst and src must not overlap. It is valid to pass a nil dst. +// +// The blocks will require the same amount of memory to decode as encoding, +// and does not make for concurrent decoding. +// Also note that blocks do not contain CRC information, so corruption may be undetected. +// +// If you need to encode larger amounts of data, consider using +// the streaming interface which gives all of these features. +func EncodeSnappyBest(dst, src []byte) []byte { + if n := MaxEncodedLen(len(src)); n < 0 { + panic(ErrTooLarge) + } else if cap(dst) < n { + dst = make([]byte, n) + } else { + dst = dst[:n] + } + + // The block starts with the varint-encoded length of the decompressed bytes. + d := binary.PutUvarint(dst, uint64(len(src))) + + if len(src) == 0 { + return dst[:d] + } + if len(src) < minNonLiteralBlockSize { + d += emitLiteral(dst[d:], src) + return dst[:d] + } + + n := encodeBlockBestSnappy(dst[d:], src) + if n > 0 { + d += n + return dst[:d] + } + // Not compressible + d += emitLiteral(dst[d:], src) + return dst[:d] +} + // ConcatBlocks will concatenate the supplied blocks and append them to the supplied destination. // If the destination is nil or too small, a new will be allocated. // The blocks are not validated, so garbage in = garbage out. @@ -322,6 +410,8 @@ type Writer struct { // wroteStreamHeader is whether we have written the stream header. wroteStreamHeader bool paramsOK bool + snappy bool + flushOnWrite bool level uint8 } @@ -410,6 +500,9 @@ func (w *Writer) Reset(writer io.Writer) { // Write satisfies the io.Writer interface. func (w *Writer) Write(p []byte) (nRet int, errRet error) { + if w.flushOnWrite { + return w.write(p) + } // If we exceed the input buffer size, start writing for len(p) > (cap(w.ibuf)-len(w.ibuf)) && w.err(nil) == nil { var n int @@ -487,7 +580,9 @@ func (w *Writer) ReadFrom(r io.Reader) (n int64, err error) { // EncodeBuffer will add a buffer to the stream. // This is the fastest way to encode a stream, // but the input buffer cannot be written to by the caller -// until this function, Flush or Close has been called. +// until Flush or Close has been called when concurrency != 1. +// +// If you cannot control that, use the regular Write function. // // Note that input is not buffered. // This means that each write will result in discrete blocks being created. @@ -497,6 +592,10 @@ func (w *Writer) EncodeBuffer(buf []byte) (err error) { return err } + if w.flushOnWrite { + _, err := w.write(buf) + return err + } // Flush queued data first. if len(w.ibuf) > 0 { err := w.Flush() @@ -514,7 +613,11 @@ func (w *Writer) EncodeBuffer(buf []byte) (err error) { w.wroteStreamHeader = true hWriter := make(chan result) w.output <- hWriter - hWriter <- []byte(magicChunk) + if w.snappy { + hWriter <- []byte(magicChunkSnappy) + } else { + hWriter <- []byte(magicChunk) + } } for len(buf) > 0 { @@ -538,15 +641,7 @@ func (w *Writer) EncodeBuffer(buf []byte) (err error) { // Attempt compressing. n := binary.PutUvarint(obuf[obufHeaderLen:], uint64(len(uncompressed))) - var n2 int - switch w.level { - case levelFast: - n2 = encodeBlock(obuf[obufHeaderLen+n:], uncompressed) - case levelBetter: - n2 = encodeBlockBetter(obuf[obufHeaderLen+n:], uncompressed) - case levelBest: - n2 = encodeBlockBest(obuf[obufHeaderLen+n:], uncompressed) - } + n2 := w.encodeBlock(obuf[obufHeaderLen+n:], uncompressed) // Check if we should use this, or store as uncompressed instead. if n2 > 0 { @@ -575,6 +670,29 @@ func (w *Writer) EncodeBuffer(buf []byte) (err error) { return nil } +func (w *Writer) encodeBlock(obuf, uncompressed []byte) int { + if w.snappy { + switch w.level { + case levelFast: + return encodeBlockSnappy(obuf, uncompressed) + case levelBetter: + return encodeBlockBetterSnappy(obuf, uncompressed) + case levelBest: + return encodeBlockBestSnappy(obuf, uncompressed) + } + return 0 + } + switch w.level { + case levelFast: + return encodeBlock(obuf, uncompressed) + case levelBetter: + return encodeBlockBetter(obuf, uncompressed) + case levelBest: + return encodeBlockBest(obuf, uncompressed) + } + return 0 +} + func (w *Writer) write(p []byte) (nRet int, errRet error) { if err := w.err(nil); err != nil { return 0, err @@ -589,7 +707,11 @@ func (w *Writer) write(p []byte) (nRet int, errRet error) { w.wroteStreamHeader = true hWriter := make(chan result) w.output <- hWriter - hWriter <- []byte(magicChunk) + if w.snappy { + hWriter <- []byte(magicChunkSnappy) + } else { + hWriter <- []byte(magicChunk) + } } var uncompressed []byte @@ -618,15 +740,7 @@ func (w *Writer) write(p []byte) (nRet int, errRet error) { // Attempt compressing. n := binary.PutUvarint(obuf[obufHeaderLen:], uint64(len(uncompressed))) - var n2 int - switch w.level { - case levelFast: - n2 = encodeBlock(obuf[obufHeaderLen+n:], uncompressed) - case levelBetter: - n2 = encodeBlockBetter(obuf[obufHeaderLen+n:], uncompressed) - case levelBest: - n2 = encodeBlockBest(obuf[obufHeaderLen+n:], uncompressed) - } + n2 := w.encodeBlock(obuf[obufHeaderLen+n:], uncompressed) // Check if we should use this, or store as uncompressed instead. if n2 > 0 { @@ -678,7 +792,11 @@ func (w *Writer) writeFull(inbuf []byte) (errRet error) { w.wroteStreamHeader = true hWriter := make(chan result) w.output <- hWriter - hWriter <- []byte(magicChunk) + if w.snappy { + hWriter <- []byte(magicChunkSnappy) + } else { + hWriter <- []byte(magicChunk) + } } // Get an output buffer. @@ -697,15 +815,7 @@ func (w *Writer) writeFull(inbuf []byte) (errRet error) { // Attempt compressing. n := binary.PutUvarint(obuf[obufHeaderLen:], uint64(len(uncompressed))) - var n2 int - switch w.level { - case levelFast: - n2 = encodeBlock(obuf[obufHeaderLen+n:], uncompressed) - case levelBetter: - n2 = encodeBlockBetter(obuf[obufHeaderLen+n:], uncompressed) - case levelBest: - n2 = encodeBlockBest(obuf[obufHeaderLen+n:], uncompressed) - } + n2 := w.encodeBlock(obuf[obufHeaderLen+n:], uncompressed) // Check if we should use this, or store as uncompressed instead. if n2 > 0 { @@ -742,7 +852,13 @@ func (w *Writer) writeSync(p []byte) (nRet int, errRet error) { } if !w.wroteStreamHeader { w.wroteStreamHeader = true - n, err := w.writer.Write([]byte(magicChunk)) + var n int + var err error + if w.snappy { + n, err = w.writer.Write([]byte(magicChunkSnappy)) + } else { + n, err = w.writer.Write([]byte(magicChunk)) + } if err != nil { return 0, w.err(err) } @@ -769,15 +885,7 @@ func (w *Writer) writeSync(p []byte) (nRet int, errRet error) { // Attempt compressing. n := binary.PutUvarint(obuf[obufHeaderLen:], uint64(len(uncompressed))) - var n2 int - switch w.level { - case levelFast: - n2 = encodeBlock(obuf[obufHeaderLen+n:], uncompressed) - case levelBetter: - n2 = encodeBlockBetter(obuf[obufHeaderLen+n:], uncompressed) - case levelBest: - n2 = encodeBlockBest(obuf[obufHeaderLen+n:], uncompressed) - } + n2 := w.encodeBlock(obuf[obufHeaderLen+n:], uncompressed) if n2 > 0 { chunkType = uint8(chunkTypeCompressedData) @@ -832,11 +940,17 @@ func (w *Writer) Flush() error { // Queue any data still in input buffer. if len(w.ibuf) != 0 { - _, err := w.write(w.ibuf) - w.ibuf = w.ibuf[:0] - err = w.err(err) - if err != nil { - return err + if !w.wroteStreamHeader { + _, err := w.writeSync(w.ibuf) + w.ibuf = w.ibuf[:0] + return w.err(err) + } else { + _, err := w.write(w.ibuf) + w.ibuf = w.ibuf[:0] + err = w.err(err) + if err != nil { + return err + } } } if w.output == nil { @@ -981,8 +1095,14 @@ func WriterUncompressed() WriterOption { // and will increase compression slightly, but it will limit the possible // concurrency for smaller payloads for both encoding and decoding. // Default block size is 1MB. +// +// When writing Snappy compatible output using WriterSnappyCompat, +// the maximum block size is 64KB. func WriterBlockSize(n int) WriterOption { return func(w *Writer) error { + if w.snappy && n > maxSnappyBlockSize || n < minBlockSize { + return errors.New("s2: block size too large. Must be <= 64K and >=4KB on for snappy compatible output") + } if n > maxBlockSize || n < minBlockSize { return errors.New("s2: block size too large. Must be <= 4MB and >=4KB") } @@ -1022,3 +1142,31 @@ func WriterPaddingSrc(reader io.Reader) WriterOption { return nil } } + +// WriterSnappyCompat will write snappy compatible output. +// The output can be decompressed using either snappy or s2. +// If block size is more than 64KB it is set to that. +func WriterSnappyCompat() WriterOption { + return func(w *Writer) error { + w.snappy = true + if w.blockSize > 64<<10 { + // We choose 8 bytes less than 64K, since that will make literal emits slightly more effective. + // And allows us to skip some size checks. + w.blockSize = (64 << 10) - 8 + } + return nil + } +} + +// WriterFlushOnWrite will compress blocks on each call to the Write function. +// +// This is quite inefficient as blocks size will depend on the write size. +// +// Use WriterConcurrency(1) to also make sure that output is flushed. +// When Write calls return, otherwise they will be written when compression is done. +func WriterFlushOnWrite() WriterOption { + return func(w *Writer) error { + w.flushOnWrite = true + return nil + } +} diff --git a/vendor/github.com/klauspost/compress/s2/encode_all.go b/vendor/github.com/klauspost/compress/s2/encode_all.go index f1d27b6a..8b16c38a 100644 --- a/vendor/github.com/klauspost/compress/s2/encode_all.go +++ b/vendor/github.com/klauspost/compress/s2/encode_all.go @@ -12,16 +12,11 @@ import ( ) func load32(b []byte, i int) uint32 { - b = b[i:] - b = b[:4] - return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 + return binary.LittleEndian.Uint32(b[i:]) } func load64(b []byte, i int) uint64 { - b = b[i:] - b = b[:8] - return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | - uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 + return binary.LittleEndian.Uint64(b[i:]) } // hash6 returns the hash of the lowest 6 bytes of u to fit in a hash table with h bits. @@ -274,3 +269,188 @@ emitRemainder: } return d } + +func encodeBlockSnappyGo(dst, src []byte) (d int) { + // Initialize the hash table. + const ( + tableBits = 14 + maxTableSize = 1 << tableBits + ) + + var table [maxTableSize]uint32 + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := len(src) - inputMargin + + // Bail if we can't compress to at least this. + dstLimit := len(src) - len(src)>>5 - 5 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. + s := 1 + cv := load64(src, s) + + // We search for a repeat at -1, but don't output repeats when nextEmit == 0 + repeat := 1 + + for { + candidate := 0 + for { + // Next src position to check + nextS := s + (s-nextEmit)>>6 + 4 + if nextS > sLimit { + goto emitRemainder + } + hash0 := hash6(cv, tableBits) + hash1 := hash6(cv>>8, tableBits) + candidate = int(table[hash0]) + candidate2 := int(table[hash1]) + table[hash0] = uint32(s) + table[hash1] = uint32(s + 1) + hash2 := hash6(cv>>16, tableBits) + + // Check repeat at offset checkRep. + const checkRep = 1 + if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) { + base := s + checkRep + // Extend back + for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { + i-- + base-- + } + d += emitLiteral(dst[d:], src[nextEmit:base]) + + // Extend forward + candidate := s - repeat + 4 + checkRep + s += 4 + checkRep + for s <= sLimit { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + + d += emitCopyNoRepeat(dst[d:], repeat, s-base) + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + cv = load64(src, s) + continue + } + + if uint32(cv) == load32(src, candidate) { + break + } + candidate = int(table[hash2]) + if uint32(cv>>8) == load32(src, candidate2) { + table[hash2] = uint32(s + 2) + candidate = candidate2 + s++ + break + } + table[hash2] = uint32(s + 2) + if uint32(cv>>16) == load32(src, candidate) { + s += 2 + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards + for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] { + candidate-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + + d += emitLiteral(dst[d:], src[nextEmit:s]) + + // Call emitCopy, and then see if another emitCopy could be our next + // move. Repeat until we find no match for the input immediately after + // what was consumed by the last emitCopy call. + // + // If we exit this loop normally then we need to call emitLiteral next, + // though we don't yet know how big the literal will be. We handle that + // by proceeding to the next iteration of the main loop. We also can + // exit this loop via goto if we get close to exhausting the input. + for { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + base := s + repeat = base - candidate + + // Extend the 4-byte match as long as possible. + s += 4 + candidate += 4 + for s <= len(src)-8 { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + + d += emitCopyNoRepeat(dst[d:], repeat, s-base) + if false { + // Validate match. + a := src[base:s] + b := src[base-repeat : base-repeat+(s-base)] + if !bytes.Equal(a, b) { + panic("mismatch") + } + } + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + // Check for an immediate match, otherwise start search at s+1 + x := load64(src, s-2) + m2Hash := hash6(x, tableBits) + currHash := hash6(x>>16, tableBits) + candidate = int(table[currHash]) + table[m2Hash] = uint32(s - 2) + table[currHash] = uint32(s) + if uint32(x>>16) != load32(src, candidate) { + cv = load64(src, s+1) + s++ + break + } + } + } + +emitRemainder: + if nextEmit < len(src) { + // Bail if we exceed the maximum size. + if d+len(src)-nextEmit > dstLimit { + return 0 + } + d += emitLiteral(dst[d:], src[nextEmit:]) + } + return d +} diff --git a/vendor/github.com/klauspost/compress/s2/encode_amd64.go b/vendor/github.com/klauspost/compress/s2/encode_amd64.go index e5b47a7a..f67820a1 100644 --- a/vendor/github.com/klauspost/compress/s2/encode_amd64.go +++ b/vendor/github.com/klauspost/compress/s2/encode_amd64.go @@ -90,9 +90,12 @@ func encodeBlockSnappy(dst, src []byte) (d int) { // Use 8 bit table when less than... limit8B = 512 ) - if len(src) >= limit12B { + if len(src) >= 64<<10 { return encodeSnappyBlockAsm(dst, src) } + if len(src) >= limit12B { + return encodeSnappyBlockAsm64K(dst, src) + } if len(src) >= limit10B { return encodeSnappyBlockAsm12B(dst, src) } @@ -104,3 +107,37 @@ func encodeBlockSnappy(dst, src []byte) (d int) { } return encodeSnappyBlockAsm8B(dst, src) } + +// encodeBlockSnappy encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// len(dst) >= MaxEncodedLen(len(src)) && +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +func encodeBlockBetterSnappy(dst, src []byte) (d int) { + const ( + // Use 12 bit table when less than... + limit12B = 16 << 10 + // Use 10 bit table when less than... + limit10B = 4 << 10 + // Use 8 bit table when less than... + limit8B = 512 + ) + if len(src) >= 64<<10 { + return encodeSnappyBetterBlockAsm(dst, src) + } + if len(src) >= limit12B { + return encodeSnappyBetterBlockAsm64K(dst, src) + } + if len(src) >= limit10B { + return encodeSnappyBetterBlockAsm12B(dst, src) + } + if len(src) >= limit8B { + return encodeSnappyBetterBlockAsm10B(dst, src) + } + if len(src) < minNonLiteralBlockSize { + return 0 + } + return encodeSnappyBetterBlockAsm8B(dst, src) +} diff --git a/vendor/github.com/klauspost/compress/s2/encode_best.go b/vendor/github.com/klauspost/compress/s2/encode_best.go index 4b571bf3..44803477 100644 --- a/vendor/github.com/klauspost/compress/s2/encode_best.go +++ b/vendor/github.com/klauspost/compress/s2/encode_best.go @@ -31,9 +31,6 @@ func encodeBlockBest(dst, src []byte) (d int) { inputMargin = 8 + 2 ) - var lTable [maxLTableSize]uint64 - var sTable [maxSTableSize]uint64 - // sLimit is when to stop looking for offset/length copies. The inputMargin // lets us use a fast path for emitLiteral in the main loop, while we are // looking for copies. @@ -42,6 +39,9 @@ func encodeBlockBest(dst, src []byte) (d int) { return 0 } + var lTable [maxLTableSize]uint64 + var sTable [maxSTableSize]uint64 + // Bail if we can't compress to at least this. dstLimit := len(src) - 5 @@ -62,18 +62,25 @@ func encodeBlockBest(dst, src []byte) (d int) { getPrev := func(x uint64) int { return int(x >> 32) } + const maxSkip = 64 for { type match struct { offset int s int length int + score int rep bool } var best match for { // Next src position to check - nextS := s + (s-nextEmit)>>8 + 1 + nextS := (s-nextEmit)>>8 + 1 + if nextS > maxSkip { + nextS = s + maxSkip + } else { + nextS += s + } if nextS > sLimit { goto emitRemainder } @@ -82,6 +89,20 @@ func encodeBlockBest(dst, src []byte) (d int) { candidateL := lTable[hashL] candidateS := sTable[hashS] + score := func(m match) int { + // Matches that are longer forward are penalized since we must emit it as a literal. + score := m.length - m.s + if nextEmit == m.s { + // If we do not have to emit literals, we save 1 byte + score++ + } + offset := m.s - m.offset + if m.rep { + return score - emitRepeatSize(offset, m.length) + } + return score - emitCopySize(offset, m.length) + } + matchAt := func(offset, s int, first uint32, rep bool) match { if best.length != 0 && best.s-best.offset == s-offset { // Don't retest if we have the same offset. @@ -90,41 +111,35 @@ func encodeBlockBest(dst, src []byte) (d int) { if load32(src, offset) != first { return match{offset: offset, s: s} } - m := match{offset: offset, s: s, length: 4, rep: rep} + m := match{offset: offset, s: s, length: 4 + offset, rep: rep} s += 4 for s <= sLimit { - if diff := load64(src, s) ^ load64(src, offset+m.length); diff != 0 { + if diff := load64(src, s) ^ load64(src, m.length); diff != 0 { m.length += bits.TrailingZeros64(diff) >> 3 break } s += 8 m.length += 8 } + m.length -= offset + m.score = score(m) + if m.score <= -m.s { + // Eliminate if no savings, we might find a better one. + m.length = 0 + } return m } bestOf := func(a, b match) match { - aScore := b.s - a.s + a.length - bScore := a.s - b.s + b.length - if !a.rep { - // Estimate bytes needed to store offset. - offset := a.s - a.offset - if offset >= 65536 { - aScore -= 5 - } else { - aScore -= 3 - } + if b.length == 0 { + return a } - if !b.rep { - // Estimate bytes needed to store offset. - offset := b.s - b.offset - if offset >= 65536 { - bScore -= 5 - } else { - bScore -= 3 - } + if a.length == 0 { + return b } - if aScore >= bScore { + as := a.score + b.s + bs := b.score + a.s + if as >= bs { return a } return b @@ -160,6 +175,23 @@ func encodeBlockBest(dst, src []byte) (d int) { best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv), false)) best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv), false)) } + // Search for a match at best match end, see if that is better. + if sAt := best.s + best.length; sAt < sLimit { + sBack := best.s + backL := best.length + // Load initial values + cv = load64(src, sBack) + // Search for mismatch + next := lTable[hash8(load64(src, sAt), lTableBits)] + //next := sTable[hash4(load64(src, sAt), sTableBits)] + + if checkAt := getCur(next) - backL; checkAt > 0 { + best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false)) + } + if checkAt := getPrev(next) - backL; checkAt > 0 { + best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false)) + } + } } } @@ -250,3 +282,323 @@ emitRemainder: } return d } + +// encodeBlockBestSnappy encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// len(dst) >= MaxEncodedLen(len(src)) && +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +func encodeBlockBestSnappy(dst, src []byte) (d int) { + // Initialize the hash tables. + const ( + // Long hash matches. + lTableBits = 19 + maxLTableSize = 1 << lTableBits + + // Short hash matches. + sTableBits = 16 + maxSTableSize = 1 << sTableBits + + inputMargin = 8 + 2 + ) + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := len(src) - inputMargin + if len(src) < minNonLiteralBlockSize { + return 0 + } + + var lTable [maxLTableSize]uint64 + var sTable [maxSTableSize]uint64 + + // Bail if we can't compress to at least this. + dstLimit := len(src) - 5 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. + s := 1 + cv := load64(src, s) + + // We search for a repeat at -1, but don't output repeats when nextEmit == 0 + repeat := 1 + const lowbitMask = 0xffffffff + getCur := func(x uint64) int { + return int(x & lowbitMask) + } + getPrev := func(x uint64) int { + return int(x >> 32) + } + const maxSkip = 64 + + for { + type match struct { + offset int + s int + length int + score int + } + var best match + for { + // Next src position to check + nextS := (s-nextEmit)>>8 + 1 + if nextS > maxSkip { + nextS = s + maxSkip + } else { + nextS += s + } + if nextS > sLimit { + goto emitRemainder + } + hashL := hash8(cv, lTableBits) + hashS := hash4(cv, sTableBits) + candidateL := lTable[hashL] + candidateS := sTable[hashS] + + score := func(m match) int { + // Matches that are longer forward are penalized since we must emit it as a literal. + score := m.length - m.s + if nextEmit == m.s { + // If we do not have to emit literals, we save 1 byte + score++ + } + offset := m.s - m.offset + + return score - emitCopySize(offset, m.length) + } + + matchAt := func(offset, s int, first uint32) match { + if best.length != 0 && best.s-best.offset == s-offset { + // Don't retest if we have the same offset. + return match{offset: offset, s: s} + } + if load32(src, offset) != first { + return match{offset: offset, s: s} + } + m := match{offset: offset, s: s, length: 4 + offset} + s += 4 + for s <= sLimit { + if diff := load64(src, s) ^ load64(src, m.length); diff != 0 { + m.length += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + m.length += 8 + } + m.length -= offset + m.score = score(m) + if m.score <= -m.s { + // Eliminate if no savings, we might find a better one. + m.length = 0 + } + return m + } + + bestOf := func(a, b match) match { + if b.length == 0 { + return a + } + if a.length == 0 { + return b + } + as := a.score + b.s + bs := b.score + a.s + if as >= bs { + return a + } + return b + } + + best = bestOf(matchAt(getCur(candidateL), s, uint32(cv)), matchAt(getPrev(candidateL), s, uint32(cv))) + best = bestOf(best, matchAt(getCur(candidateS), s, uint32(cv))) + best = bestOf(best, matchAt(getPrev(candidateS), s, uint32(cv))) + + { + best = bestOf(best, matchAt(s-repeat+1, s+1, uint32(cv>>8))) + if best.length > 0 { + // s+1 + nextShort := sTable[hash4(cv>>8, sTableBits)] + s := s + 1 + cv := load64(src, s) + nextLong := lTable[hash8(cv, lTableBits)] + best = bestOf(best, matchAt(getCur(nextShort), s, uint32(cv))) + best = bestOf(best, matchAt(getPrev(nextShort), s, uint32(cv))) + best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv))) + best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv))) + // Repeat at + 2 + best = bestOf(best, matchAt(s-repeat+1, s+1, uint32(cv>>8))) + + // s+2 + if true { + nextShort = sTable[hash4(cv>>8, sTableBits)] + s++ + cv = load64(src, s) + nextLong = lTable[hash8(cv, lTableBits)] + best = bestOf(best, matchAt(getCur(nextShort), s, uint32(cv))) + best = bestOf(best, matchAt(getPrev(nextShort), s, uint32(cv))) + best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv))) + best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv))) + } + // Search for a match at best match end, see if that is better. + if sAt := best.s + best.length; sAt < sLimit { + sBack := best.s + backL := best.length + // Load initial values + cv = load64(src, sBack) + // Search for mismatch + next := lTable[hash8(load64(src, sAt), lTableBits)] + //next := sTable[hash4(load64(src, sAt), sTableBits)] + + if checkAt := getCur(next) - backL; checkAt > 0 { + best = bestOf(best, matchAt(checkAt, sBack, uint32(cv))) + } + if checkAt := getPrev(next) - backL; checkAt > 0 { + best = bestOf(best, matchAt(checkAt, sBack, uint32(cv))) + } + } + } + } + + // Update table + lTable[hashL] = uint64(s) | candidateL<<32 + sTable[hashS] = uint64(s) | candidateS<<32 + + if best.length > 0 { + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards, not needed for repeats... + s = best.s + if true { + for best.offset > 0 && s > nextEmit && src[best.offset-1] == src[s-1] { + best.offset-- + best.length++ + s-- + } + } + if false && best.offset >= s { + panic(fmt.Errorf("t %d >= s %d", best.offset, s)) + } + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + base := s + offset := s - best.offset + + s += best.length + + if offset > 65535 && s-base <= 5 { + // Bail if the match is equal or worse to the encoding. + s = best.s + 1 + if s >= sLimit { + goto emitRemainder + } + cv = load64(src, s) + continue + } + d += emitLiteral(dst[d:], src[nextEmit:base]) + d += emitCopyNoRepeat(dst[d:], offset, best.length) + repeat = offset + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + // Fill tables... + for i := best.s + 1; i < s; i++ { + cv0 := load64(src, i) + long0 := hash8(cv0, lTableBits) + short0 := hash4(cv0, sTableBits) + lTable[long0] = uint64(i) | lTable[long0]<<32 + sTable[short0] = uint64(i) | sTable[short0]<<32 + } + cv = load64(src, s) + } + +emitRemainder: + if nextEmit < len(src) { + // Bail if we exceed the maximum size. + if d+len(src)-nextEmit > dstLimit { + return 0 + } + d += emitLiteral(dst[d:], src[nextEmit:]) + } + return d +} + +// emitCopySize returns the size to encode the offset+length +// +// It assumes that: +// 1 <= offset && offset <= math.MaxUint32 +// 4 <= length && length <= 1 << 24 +func emitCopySize(offset, length int) int { + if offset >= 65536 { + i := 0 + if length > 64 { + length -= 64 + if length >= 4 { + // Emit remaining as repeats + return 5 + emitRepeatSize(offset, length) + } + i = 5 + } + if length == 0 { + return i + } + return i + 5 + } + + // Offset no more than 2 bytes. + if length > 64 { + // Emit remaining as repeats, at least 4 bytes remain. + return 3 + emitRepeatSize(offset, length-60) + } + if length >= 12 || offset >= 2048 { + return 3 + } + // Emit the remaining copy, encoded as 2 bytes. + return 2 +} + +// emitRepeatSize returns the number of bytes required to encode a repeat. +// Length must be at least 4 and < 1<<24 +func emitRepeatSize(offset, length int) int { + // Repeat offset, make length cheaper + if length <= 4+4 || (length < 8+4 && offset < 2048) { + return 2 + } + if length < (1<<8)+4+4 { + return 3 + } + if length < (1<<16)+(1<<8)+4 { + return 4 + } + const maxRepeat = (1 << 24) - 1 + length -= (1 << 16) - 4 + left := 0 + if length > maxRepeat { + left = length - maxRepeat + 4 + length = maxRepeat - 4 + } + if left > 0 { + return 5 + emitRepeatSize(offset, left) + } + return 5 +} diff --git a/vendor/github.com/klauspost/compress/s2/encode_better.go b/vendor/github.com/klauspost/compress/s2/encode_better.go index 28bf5c07..943215b8 100644 --- a/vendor/github.com/klauspost/compress/s2/encode_better.go +++ b/vendor/github.com/klauspost/compress/s2/encode_better.go @@ -45,6 +45,14 @@ func hash8(u uint64, h uint8) uint32 { // len(dst) >= MaxEncodedLen(len(src)) && // minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize func encodeBlockBetterGo(dst, src []byte) (d int) { + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := len(src) - inputMargin + if len(src) < minNonLiteralBlockSize { + return 0 + } + // Initialize the hash tables. const ( // Long hash matches. @@ -59,14 +67,6 @@ func encodeBlockBetterGo(dst, src []byte) (d int) { var lTable [maxLTableSize]uint32 var sTable [maxSTableSize]uint32 - // sLimit is when to stop looking for offset/length copies. The inputMargin - // lets us use a fast path for emitLiteral in the main loop, while we are - // looking for copies. - sLimit := len(src) - inputMargin - if len(src) < minNonLiteralBlockSize { - return 0 - } - // Bail if we can't compress to at least this. dstLimit := len(src) - len(src)>>5 - 6 @@ -111,7 +111,15 @@ func encodeBlockBetterGo(dst, src []byte) (d int) { // Extend forward candidate := s - repeat + 4 + checkRep s += 4 + checkRep - for s <= sLimit { + for s < len(src) { + if len(src)-s < 8 { + if src[s] == src[candidate] { + s++ + candidate++ + continue + } + break + } if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { s += bits.TrailingZeros64(diff) >> 3 break @@ -175,7 +183,15 @@ func encodeBlockBetterGo(dst, src []byte) (d int) { // Extend the 4-byte match as long as possible. s += 4 candidateL += 4 - for s <= len(src)-8 { + for s < len(src) { + if len(src)-s < 8 { + if src[s] == src[candidateL] { + s++ + candidateL++ + continue + } + break + } if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 { s += bits.TrailingZeros64(diff) >> 3 break @@ -193,6 +209,7 @@ func encodeBlockBetterGo(dst, src []byte) (d int) { cv = load64(src, s) continue } + d += emitLiteral(dst[d:], src[nextEmit:base]) if repeat == offset { d += emitRepeat(dst[d:], offset, s-base) @@ -219,8 +236,186 @@ func encodeBlockBetterGo(dst, src []byte) (d int) { cv1 := load64(src, index1) cv = load64(src, s) lTable[hash7(cv0, lTableBits)] = uint32(index0) + lTable[hash7(cv0>>8, lTableBits)] = uint32(index0 + 1) lTable[hash7(cv1, lTableBits)] = uint32(index1) + lTable[hash7(cv1>>8, lTableBits)] = uint32(index1 + 1) sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) + sTable[hash4(cv0>>16, sTableBits)] = uint32(index0 + 2) + sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1) + } + +emitRemainder: + if nextEmit < len(src) { + // Bail if we exceed the maximum size. + if d+len(src)-nextEmit > dstLimit { + return 0 + } + d += emitLiteral(dst[d:], src[nextEmit:]) + } + return d +} + +// encodeBlockBetterSnappyGo encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// len(dst) >= MaxEncodedLen(len(src)) && +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +func encodeBlockBetterSnappyGo(dst, src []byte) (d int) { + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := len(src) - inputMargin + if len(src) < minNonLiteralBlockSize { + return 0 + } + + // Initialize the hash tables. + const ( + // Long hash matches. + lTableBits = 16 + maxLTableSize = 1 << lTableBits + + // Short hash matches. + sTableBits = 14 + maxSTableSize = 1 << sTableBits + ) + + var lTable [maxLTableSize]uint32 + var sTable [maxSTableSize]uint32 + + // Bail if we can't compress to at least this. + dstLimit := len(src) - len(src)>>5 - 6 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. + s := 1 + cv := load64(src, s) + + // We initialize repeat to 0, so we never match on first attempt + repeat := 0 + const maxSkip = 100 + + for { + candidateL := 0 + nextS := 0 + for { + // Next src position to check + nextS = (s-nextEmit)>>7 + 1 + if nextS > maxSkip { + nextS = s + maxSkip + } else { + nextS += s + } + + if nextS > sLimit { + goto emitRemainder + } + hashL := hash7(cv, lTableBits) + hashS := hash4(cv, sTableBits) + candidateL = int(lTable[hashL]) + candidateS := int(sTable[hashS]) + lTable[hashL] = uint32(s) + sTable[hashS] = uint32(s) + + if uint32(cv) == load32(src, candidateL) { + break + } + + // Check our short candidate + if uint32(cv) == load32(src, candidateS) { + // Try a long candidate at s+1 + hashL = hash7(cv>>8, lTableBits) + candidateL = int(lTable[hashL]) + lTable[hashL] = uint32(s + 1) + if uint32(cv>>8) == load32(src, candidateL) { + s++ + break + } + // Use our short candidate. + candidateL = candidateS + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards + for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] { + candidateL-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + base := s + offset := base - candidateL + + // Extend the 4-byte match as long as possible. + s += 4 + candidateL += 4 + for s < len(src) { + if len(src)-s < 8 { + if src[s] == src[candidateL] { + s++ + candidateL++ + continue + } + break + } + if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidateL += 8 + } + + if offset > 65535 && s-base <= 5 && repeat != offset { + // Bail if the match is equal or worse to the encoding. + s = nextS + 1 + if s >= sLimit { + goto emitRemainder + } + cv = load64(src, s) + continue + } + + d += emitLiteral(dst[d:], src[nextEmit:base]) + d += emitCopyNoRepeat(dst[d:], offset, s-base) + repeat = offset + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + // Index match start+1 (long) and start+2 (short) + index0 := base + 1 + // Index match end-2 (long) and end-1 (short) + index1 := s - 2 + + cv0 := load64(src, index0) + cv1 := load64(src, index1) + cv = load64(src, s) + lTable[hash7(cv0, lTableBits)] = uint32(index0) + lTable[hash7(cv0>>8, lTableBits)] = uint32(index0 + 1) + lTable[hash7(cv1, lTableBits)] = uint32(index1) + lTable[hash7(cv1>>8, lTableBits)] = uint32(index1 + 1) + sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) + sTable[hash4(cv0>>16, sTableBits)] = uint32(index0 + 2) sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1) } diff --git a/vendor/github.com/klauspost/compress/s2/encode_go.go b/vendor/github.com/klauspost/compress/s2/encode_go.go index 8be2b8f8..c45a89dd 100644 --- a/vendor/github.com/klauspost/compress/s2/encode_go.go +++ b/vendor/github.com/klauspost/compress/s2/encode_go.go @@ -3,7 +3,6 @@ package s2 import ( - "bytes" "math/bits" ) @@ -30,6 +29,29 @@ func encodeBlockBetter(dst, src []byte) (d int) { return encodeBlockBetterGo(dst, src) } +// encodeBlockBetter encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// len(dst) >= MaxEncodedLen(len(src)) +func encodeBlockBetterSnappy(dst, src []byte) (d int) { + return encodeBlockBetterSnappyGo(dst, src) +} + +// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// len(dst) >= MaxEncodedLen(len(src)) +func encodeBlockSnappy(dst, src []byte) (d int) { + if len(src) < minNonLiteralBlockSize { + return 0 + } + return encodeBlockSnappyGo(dst, src) +} + // emitLiteral writes a literal chunk and returns the number of bytes written. // // It assumes that: @@ -273,188 +295,3 @@ func matchLen(a []byte, b []byte) int { } return len(a) + checked } - -func encodeBlockSnappy(dst, src []byte) (d int) { - // Initialize the hash table. - const ( - tableBits = 14 - maxTableSize = 1 << tableBits - ) - - var table [maxTableSize]uint32 - - // sLimit is when to stop looking for offset/length copies. The inputMargin - // lets us use a fast path for emitLiteral in the main loop, while we are - // looking for copies. - sLimit := len(src) - inputMargin - - // Bail if we can't compress to at least this. - dstLimit := len(src) - len(src)>>5 - 5 - - // nextEmit is where in src the next emitLiteral should start from. - nextEmit := 0 - - // The encoded form must start with a literal, as there are no previous - // bytes to copy, so we start looking for hash matches at s == 1. - s := 1 - cv := load64(src, s) - - // We search for a repeat at -1, but don't output repeats when nextEmit == 0 - repeat := 1 - - for { - candidate := 0 - for { - // Next src position to check - nextS := s + (s-nextEmit)>>6 + 4 - if nextS > sLimit { - goto emitRemainder - } - hash0 := hash6(cv, tableBits) - hash1 := hash6(cv>>8, tableBits) - candidate = int(table[hash0]) - candidate2 := int(table[hash1]) - table[hash0] = uint32(s) - table[hash1] = uint32(s + 1) - hash2 := hash6(cv>>16, tableBits) - - // Check repeat at offset checkRep. - const checkRep = 1 - if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) { - base := s + checkRep - // Extend back - for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { - i-- - base-- - } - d += emitLiteral(dst[d:], src[nextEmit:base]) - - // Extend forward - candidate := s - repeat + 4 + checkRep - s += 4 + checkRep - for s <= sLimit { - if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { - s += bits.TrailingZeros64(diff) >> 3 - break - } - s += 8 - candidate += 8 - } - - d += emitCopyNoRepeat(dst[d:], repeat, s-base) - nextEmit = s - if s >= sLimit { - goto emitRemainder - } - - cv = load64(src, s) - continue - } - - if uint32(cv) == load32(src, candidate) { - break - } - candidate = int(table[hash2]) - if uint32(cv>>8) == load32(src, candidate2) { - table[hash2] = uint32(s + 2) - candidate = candidate2 - s++ - break - } - table[hash2] = uint32(s + 2) - if uint32(cv>>16) == load32(src, candidate) { - s += 2 - break - } - - cv = load64(src, nextS) - s = nextS - } - - // Extend backwards - for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] { - candidate-- - s-- - } - - // Bail if we exceed the maximum size. - if d+(s-nextEmit) > dstLimit { - return 0 - } - - // A 4-byte match has been found. We'll later see if more than 4 bytes - // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit - // them as literal bytes. - - d += emitLiteral(dst[d:], src[nextEmit:s]) - - // Call emitCopy, and then see if another emitCopy could be our next - // move. Repeat until we find no match for the input immediately after - // what was consumed by the last emitCopy call. - // - // If we exit this loop normally then we need to call emitLiteral next, - // though we don't yet know how big the literal will be. We handle that - // by proceeding to the next iteration of the main loop. We also can - // exit this loop via goto if we get close to exhausting the input. - for { - // Invariant: we have a 4-byte match at s, and no need to emit any - // literal bytes prior to s. - base := s - repeat = base - candidate - - // Extend the 4-byte match as long as possible. - s += 4 - candidate += 4 - for s <= len(src)-8 { - if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { - s += bits.TrailingZeros64(diff) >> 3 - break - } - s += 8 - candidate += 8 - } - - d += emitCopyNoRepeat(dst[d:], repeat, s-base) - if false { - // Validate match. - a := src[base:s] - b := src[base-repeat : base-repeat+(s-base)] - if !bytes.Equal(a, b) { - panic("mismatch") - } - } - - nextEmit = s - if s >= sLimit { - goto emitRemainder - } - - if d > dstLimit { - // Do we have space for more, if not bail. - return 0 - } - // Check for an immediate match, otherwise start search at s+1 - x := load64(src, s-2) - m2Hash := hash6(x, tableBits) - currHash := hash6(x>>16, tableBits) - candidate = int(table[currHash]) - table[m2Hash] = uint32(s - 2) - table[currHash] = uint32(s) - if uint32(x>>16) != load32(src, candidate) { - cv = load64(src, s+1) - s++ - break - } - } - } - -emitRemainder: - if nextEmit < len(src) { - // Bail if we exceed the maximum size. - if d+len(src)-nextEmit > dstLimit { - return 0 - } - d += emitLiteral(dst[d:], src[nextEmit:]) - } - return d -} diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go index 9ab3c7ae..45caf6de 100644 --- a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go +++ b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go @@ -83,6 +83,13 @@ func encodeBetterBlockAsm8B(dst []byte, src []byte) int //go:noescape func encodeSnappyBlockAsm(dst []byte, src []byte) int +// encodeSnappyBlockAsm64K encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 65535 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeSnappyBlockAsm64K(dst []byte, src []byte) int + // encodeSnappyBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst. // Maximum input 16383 bytes. // It assumes that the varint-encoded length of the decompressed bytes has already been written. @@ -104,10 +111,45 @@ func encodeSnappyBlockAsm10B(dst []byte, src []byte) int //go:noescape func encodeSnappyBlockAsm8B(dst []byte, src []byte) int +// encodeSnappyBetterBlockAsm encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 4294967295 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeSnappyBetterBlockAsm(dst []byte, src []byte) int + +// encodeSnappyBetterBlockAsm64K encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 65535 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeSnappyBetterBlockAsm64K(dst []byte, src []byte) int + +// encodeSnappyBetterBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 16383 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeSnappyBetterBlockAsm12B(dst []byte, src []byte) int + +// encodeSnappyBetterBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 4095 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeSnappyBetterBlockAsm10B(dst []byte, src []byte) int + +// encodeSnappyBetterBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 511 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte) int + // emitLiteral writes a literal chunk and returns the number of bytes written. // // It assumes that: -// dst is long enough to hold the encoded bytes +// dst is long enough to hold the encoded bytes with margin of 0 bytes // 0 <= len(lit) && len(lit) <= math.MaxUint32 // //go:noescape diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s index 748c1c2e..1ac65a0e 100644 --- a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s +++ b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s @@ -28,9 +28,9 @@ zero_loop_encodeBlockAsm: JNZ zero_loop_encodeBlockAsm MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX - LEAQ -5(CX), DX - LEAQ -8(CX), BP - MOVL BP, 8(SP) + LEAQ -9(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) SHRQ $0x05, CX SUBL CX, DX LEAQ (AX)(DX*1), DX @@ -40,285 +40,266 @@ zero_loop_encodeBlockAsm: MOVQ src_base+24(FP), DX search_loop_encodeBlockAsm: - MOVQ (DX)(CX*1), SI - MOVL CX, BP - SUBL 12(SP), BP - SHRL $0x06, BP - LEAL 4(CX)(BP*1), BP - CMPL BP, 8(SP) + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x06, SI + LEAL 4(CX)(SI*1), SI + CMPL SI, 8(SP) JGE emit_remainder_encodeBlockAsm - MOVL BP, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R8 - MOVQ SI, R9 - MOVQ SI, R10 - SHRQ $0x08, R10 - SHLQ $0x10, R9 - IMULQ R8, R9 - SHRQ $0x32, R9 + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 SHLQ $0x10, R10 - IMULQ R8, R10 + IMULQ R9, R10 SHRQ $0x32, R10 - MOVL 24(SP)(R9*4), BP - MOVL 24(SP)(R10*4), DI - MOVL CX, 24(SP)(R9*4) - LEAL 1(CX), R9 - MOVL R9, 24(SP)(R10*4) - MOVQ SI, R9 - SHRQ $0x10, R9 - SHLQ $0x10, R9 - IMULQ R8, R9 - SHRQ $0x32, R9 - MOVL CX, R8 - SUBL 16(SP), R8 - MOVL 1(DX)(R8*1), R10 - MOVQ SI, R8 - SHRQ $0x08, R8 - CMPL R8, R10 + SHLQ $0x10, R11 + IMULQ R9, R11 + SHRQ $0x32, R11 + MOVL 24(SP)(R10*4), SI + MOVL 24(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + LEAL 1(CX), R10 + MOVL R10, 24(SP)(R11*4) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + MOVL CX, R9 + SUBL 16(SP), R9 + MOVL 1(DX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 JNE no_repeat_found_encodeBlockAsm - LEAL 1(CX), SI - MOVL 12(SP), DI - MOVL SI, BP - SUBL 16(SP), BP + LEAL 1(CX), DI + MOVL 12(SP), R8 + MOVL DI, SI + SUBL 16(SP), SI JZ repeat_extend_back_end_encodeBlockAsm repeat_extend_back_loop_encodeBlockAsm: - CMPL SI, DI + CMPL DI, R8 JLE repeat_extend_back_end_encodeBlockAsm - MOVB -1(DX)(BP*1), BL - MOVB -1(DX)(SI*1), R8 - CMPB BL, R8 + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(DI*1), R9 + CMPB BL, R9 JNE repeat_extend_back_end_encodeBlockAsm - LEAL -1(SI), SI - DECL BP + LEAL -1(DI), DI + DECL SI JNZ repeat_extend_back_loop_encodeBlockAsm repeat_extend_back_end_encodeBlockAsm: - MOVL 12(SP), BP - CMPL BP, SI + MOVL 12(SP), SI + CMPL SI, DI JEQ emit_literal_done_repeat_emit_encodeBlockAsm - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(BP*1), R9 - SUBL BP, R8 - LEAL -1(R8), BP - CMPL BP, $0x3c + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c JLT one_byte_repeat_emit_encodeBlockAsm - CMPL BP, $0x00000100 + CMPL SI, $0x00000100 JLT two_bytes_repeat_emit_encodeBlockAsm - CMPL BP, $0x00010000 + CMPL SI, $0x00010000 JLT three_bytes_repeat_emit_encodeBlockAsm - CMPL BP, $0x01000000 + CMPL SI, $0x01000000 JLT four_bytes_repeat_emit_encodeBlockAsm MOVB $0xfc, (AX) - MOVL BP, 1(AX) + MOVL SI, 1(AX) ADDQ $0x05, AX JMP memmove_long_repeat_emit_encodeBlockAsm four_bytes_repeat_emit_encodeBlockAsm: - MOVL BP, R10 - SHRL $0x10, R10 + MOVL SI, R11 + SHRL $0x10, R11 MOVB $0xf8, (AX) - MOVW BP, 1(AX) - MOVB R10, 3(AX) + MOVW SI, 1(AX) + MOVB R11, 3(AX) ADDQ $0x04, AX JMP memmove_long_repeat_emit_encodeBlockAsm three_bytes_repeat_emit_encodeBlockAsm: MOVB $0xf4, (AX) - MOVW BP, 1(AX) + MOVW SI, 1(AX) ADDQ $0x03, AX JMP memmove_long_repeat_emit_encodeBlockAsm two_bytes_repeat_emit_encodeBlockAsm: MOVB $0xf0, (AX) - MOVB BP, 1(AX) + MOVB SI, 1(AX) ADDQ $0x02, AX - CMPL BP, $0x40 + CMPL SI, $0x40 JL memmove_repeat_emit_encodeBlockAsm JMP memmove_long_repeat_emit_encodeBlockAsm one_byte_repeat_emit_encodeBlockAsm: - SHLB $0x02, BP - MOVB BP, (AX) + SHLB $0x02, SI + MOVB SI, (AX) ADDQ $0x01, AX memmove_repeat_emit_encodeBlockAsm: - LEAQ (AX)(R8*1), BP + LEAQ (AX)(R9*1), SI // genMemMoveShort - CMPQ R8, $0x03 - JB emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_1or2 - JE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_3 - CMPQ R8, $0x08 - JB emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_4through7 - CMPQ R8, $0x10 + CMPQ R9, $0x08 + JLE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8 + CMPQ R9, $0x10 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16 - CMPQ R8, $0x20 + CMPQ R9, $0x20 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64 -emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_1or2: - MOVB (R9), R10 - MOVB -1(R9)(R8*1), R9 - MOVB R10, (AX) - MOVB R9, -1(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm - -emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_3: - MOVW (R9), R10 - MOVB 2(R9), R9 - MOVW R10, (AX) - MOVB R9, 2(AX) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm - -emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_4through7: - MOVL (R9), R10 - MOVL -4(R9)(R8*1), R9 - MOVL R10, (AX) - MOVL R9, -4(AX)(R8*1) +emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8: + MOVQ (R10), R11 + MOVQ R11, (AX) JMP memmove_end_copy_repeat_emit_encodeBlockAsm emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) + MOVOU X1, -16(AX)(R9*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) memmove_end_copy_repeat_emit_encodeBlockAsm: - MOVQ BP, AX + MOVQ SI, AX JMP emit_literal_done_repeat_emit_encodeBlockAsm memmove_long_repeat_emit_encodeBlockAsm: - LEAQ (AX)(R8*1), BP + LEAQ (AX)(R9*1), SI // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R12 + SHRQ $0x05, R12 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R13 + SUBQ R11, R13 + DECQ R12 JA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(R9)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 + LEAQ -32(R10)(R13*1), R11 + LEAQ -32(AX)(R13*1), R14 emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R11 ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 + DECQ R12 JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(R9)(R12*1), X4 - MOVOU -16(R9)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R8, R12 + MOVOU -32(R10)(R13*1), X4 + MOVOU -16(R10)(R13*1), X5 + MOVOA X4, -32(AX)(R13*1) + MOVOA X5, -16(AX)(R13*1) + ADDQ $0x20, R13 + CMPQ R9, R13 JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ BP, AX + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX emit_literal_done_repeat_emit_encodeBlockAsm: ADDL $0x05, CX - MOVL CX, BP - SUBL 16(SP), BP - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(BP*1), BP + MOVL CX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R9 + SUBL CX, R9 + LEAQ (DX)(CX*1), R10 + LEAQ (DX)(SI*1), SI // matchLen - XORL R11, R11 - CMPL R8, $0x08 + XORL R12, R12 + CMPL R9, $0x08 JL matchlen_single_repeat_extend_encodeBlockAsm matchlen_loopback_repeat_extend_encodeBlockAsm: - MOVQ (R9)(R11*1), R10 - XORQ (BP)(R11*1), R10 - TESTQ R10, R10 + MOVQ (R10)(R12*1), R11 + XORQ (SI)(R12*1), R11 + TESTQ R11, R11 JZ matchlen_loop_repeat_extend_encodeBlockAsm - BSFQ R10, R10 - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 + BSFQ R11, R11 + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 JMP repeat_extend_forward_end_encodeBlockAsm matchlen_loop_repeat_extend_encodeBlockAsm: - LEAL -8(R8), R8 - LEAL 8(R11), R11 - CMPL R8, $0x08 + LEAL -8(R9), R9 + LEAL 8(R12), R12 + CMPL R9, $0x08 JGE matchlen_loopback_repeat_extend_encodeBlockAsm matchlen_single_repeat_extend_encodeBlockAsm: - TESTL R8, R8 + TESTL R9, R9 JZ repeat_extend_forward_end_encodeBlockAsm matchlen_single_loopback_repeat_extend_encodeBlockAsm: - MOVB (R9)(R11*1), R10 - CMPB (BP)(R11*1), R10 + MOVB (R10)(R12*1), R11 + CMPB (SI)(R12*1), R11 JNE repeat_extend_forward_end_encodeBlockAsm - LEAL 1(R11), R11 - DECL R8 + LEAL 1(R12), R12 + DECL R9 JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm repeat_extend_forward_end_encodeBlockAsm: - ADDL R11, CX - MOVL CX, BP - SUBL SI, BP - MOVL 16(SP), SI - TESTL DI, DI + ADDL R12, CX + MOVL CX, SI + SUBL DI, SI + MOVL 16(SP), DI + TESTL R8, R8 JZ repeat_as_copy_encodeBlockAsm // emitRepeat emit_repeat_again_match_repeat_encodeBlockAsm: - MOVL BP, DI - LEAL -4(BP), BP - CMPL DI, $0x08 + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 JLE repeat_two_match_repeat_encodeBlockAsm - CMPL DI, $0x0c + CMPL R8, $0x0c JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm - CMPL SI, $0x00000800 + CMPL DI, $0x00000800 JLT repeat_two_offset_match_repeat_encodeBlockAsm cant_repeat_two_offset_match_repeat_encodeBlockAsm: - CMPL BP, $0x00000104 + CMPL SI, $0x00000104 JLT repeat_three_match_repeat_encodeBlockAsm - CMPL BP, $0x00010100 + CMPL SI, $0x00010100 JLT repeat_four_match_repeat_encodeBlockAsm - CMPL BP, $0x0100ffff + CMPL SI, $0x0100ffff JLT repeat_five_match_repeat_encodeBlockAsm - LEAL -16842747(BP), BP + LEAL -16842747(SI), SI MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -326,81 +307,81 @@ cant_repeat_two_offset_match_repeat_encodeBlockAsm: JMP emit_repeat_again_match_repeat_encodeBlockAsm repeat_five_match_repeat_encodeBlockAsm: - LEAL -65536(BP), BP - MOVL BP, SI + LEAL -65536(SI), SI + MOVL SI, DI MOVW $0x001d, (AX) - MOVW BP, 2(AX) - SARL $0x10, SI - MOVB SI, 4(AX) + MOVW SI, 2(AX) + SARL $0x10, DI + MOVB DI, 4(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsm repeat_four_match_repeat_encodeBlockAsm: - LEAL -256(BP), BP + LEAL -256(SI), SI MOVW $0x0019, (AX) - MOVW BP, 2(AX) + MOVW SI, 2(AX) ADDQ $0x04, AX JMP repeat_end_emit_encodeBlockAsm repeat_three_match_repeat_encodeBlockAsm: - LEAL -4(BP), BP + LEAL -4(SI), SI MOVW $0x0015, (AX) - MOVB BP, 2(AX) + MOVB SI, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsm repeat_two_match_repeat_encodeBlockAsm: - SHLL $0x02, BP - ORL $0x01, BP - MOVW BP, (AX) + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm repeat_two_offset_match_repeat_encodeBlockAsm: - XORQ DI, DI - LEAL 1(DI)(BP*4), BP - MOVB SI, 1(AX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, BP - MOVB BP, (AX) + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm repeat_as_copy_encodeBlockAsm: // emitCopy - CMPL SI, $0x00010000 + CMPL DI, $0x00010000 JL two_byte_offset_repeat_as_copy_encodeBlockAsm four_bytes_loop_back_repeat_as_copy_encodeBlockAsm: - CMPL BP, $0x40 + CMPL SI, $0x40 JLE four_bytes_remain_repeat_as_copy_encodeBlockAsm MOVB $0xff, (AX) - MOVL SI, 1(AX) - LEAL -64(BP), BP + MOVL DI, 1(AX) + LEAL -64(SI), SI ADDQ $0x05, AX - CMPL BP, $0x04 + CMPL SI, $0x04 JL four_bytes_remain_repeat_as_copy_encodeBlockAsm // emitRepeat emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy: - MOVL BP, DI - LEAL -4(BP), BP - CMPL DI, $0x08 + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 JLE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy - CMPL DI, $0x0c + CMPL R8, $0x0c JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy - CMPL SI, $0x00000800 + CMPL DI, $0x00000800 JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy: - CMPL BP, $0x00000104 + CMPL SI, $0x00000104 JLT repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy - CMPL BP, $0x00010100 + CMPL SI, $0x00010100 JLT repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy - CMPL BP, $0x0100ffff + CMPL SI, $0x0100ffff JLT repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy - LEAL -16842747(BP), BP + LEAL -16842747(SI), SI MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -408,85 +389,85 @@ cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy: JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy: - LEAL -65536(BP), BP - MOVL BP, SI + LEAL -65536(SI), SI + MOVL SI, DI MOVW $0x001d, (AX) - MOVW BP, 2(AX) - SARL $0x10, SI - MOVB SI, 4(AX) + MOVW SI, 2(AX) + SARL $0x10, DI + MOVB DI, 4(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsm repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy: - LEAL -256(BP), BP + LEAL -256(SI), SI MOVW $0x0019, (AX) - MOVW BP, 2(AX) + MOVW SI, 2(AX) ADDQ $0x04, AX JMP repeat_end_emit_encodeBlockAsm repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy: - LEAL -4(BP), BP + LEAL -4(SI), SI MOVW $0x0015, (AX) - MOVB BP, 2(AX) + MOVB SI, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsm repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy: - SHLL $0x02, BP - ORL $0x01, BP - MOVW BP, (AX) + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy: - XORQ DI, DI - LEAL 1(DI)(BP*4), BP - MOVB SI, 1(AX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, BP - MOVB BP, (AX) + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm four_bytes_remain_repeat_as_copy_encodeBlockAsm: - TESTL BP, BP + TESTL SI, SI JZ repeat_end_emit_encodeBlockAsm MOVB $0x03, BL - LEAL -4(BX)(BP*4), BP - MOVB BP, (AX) - MOVL SI, 1(AX) + LEAL -4(BX)(SI*4), SI + MOVB SI, (AX) + MOVL DI, 1(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsm two_byte_offset_repeat_as_copy_encodeBlockAsm: - CMPL BP, $0x40 + CMPL SI, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm MOVB $0xee, (AX) - MOVW SI, 1(AX) - LEAL -60(BP), BP + MOVW DI, 1(AX) + LEAL -60(SI), SI ADDQ $0x03, AX // emitRepeat emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short: - MOVL BP, DI - LEAL -4(BP), BP - CMPL DI, $0x08 + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 JLE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short - CMPL DI, $0x0c + CMPL R8, $0x0c JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short - CMPL SI, $0x00000800 + CMPL DI, $0x00000800 JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short: - CMPL BP, $0x00000104 + CMPL SI, $0x00000104 JLT repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short - CMPL BP, $0x00010100 + CMPL SI, $0x00010100 JLT repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short - CMPL BP, $0x0100ffff + CMPL SI, $0x0100ffff JLT repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short - LEAL -16842747(BP), BP + LEAL -16842747(SI), SI MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -494,68 +475,68 @@ cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short: JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short: - LEAL -65536(BP), BP - MOVL BP, SI + LEAL -65536(SI), SI + MOVL SI, DI MOVW $0x001d, (AX) - MOVW BP, 2(AX) - SARL $0x10, SI - MOVB SI, 4(AX) + MOVW SI, 2(AX) + SARL $0x10, DI + MOVB DI, 4(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsm repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short: - LEAL -256(BP), BP + LEAL -256(SI), SI MOVW $0x0019, (AX) - MOVW BP, 2(AX) + MOVW SI, 2(AX) ADDQ $0x04, AX JMP repeat_end_emit_encodeBlockAsm repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short: - LEAL -4(BP), BP + LEAL -4(SI), SI MOVW $0x0015, (AX) - MOVB BP, 2(AX) + MOVB SI, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsm repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short: - SHLL $0x02, BP - ORL $0x01, BP - MOVW BP, (AX) + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short: - XORQ DI, DI - LEAL 1(DI)(BP*4), BP - MOVB SI, 1(AX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, BP - MOVB BP, (AX) + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm JMP two_byte_offset_repeat_as_copy_encodeBlockAsm two_byte_offset_short_repeat_as_copy_encodeBlockAsm: - CMPL BP, $0x0c + CMPL SI, $0x0c JGE emit_copy_three_repeat_as_copy_encodeBlockAsm - CMPL SI, $0x00000800 + CMPL DI, $0x00000800 JGE emit_copy_three_repeat_as_copy_encodeBlockAsm MOVB $0x01, BL - LEAL -16(BX)(BP*4), BP - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, BP - MOVB BP, (AX) + LEAL -16(BX)(SI*4), SI + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm emit_copy_three_repeat_as_copy_encodeBlockAsm: MOVB $0x02, BL - LEAL -4(BX)(BP*4), BP - MOVB BP, (AX) - MOVW SI, 1(AX) + LEAL -4(BX)(SI*4), SI + MOVB SI, (AX) + MOVW DI, 1(AX) ADDQ $0x03, AX repeat_end_emit_encodeBlockAsm: @@ -563,16 +544,16 @@ repeat_end_emit_encodeBlockAsm: JMP search_loop_encodeBlockAsm no_repeat_found_encodeBlockAsm: - CMPL (DX)(BP*1), SI + CMPL (DX)(SI*1), DI JEQ candidate_match_encodeBlockAsm - SHRQ $0x08, SI - MOVL 24(SP)(R9*4), BP - LEAL 2(CX), R8 - CMPL (DX)(DI*1), SI + SHRQ $0x08, DI + MOVL 24(SP)(R10*4), SI + LEAL 2(CX), R9 + CMPL (DX)(R8*1), DI JEQ candidate2_match_encodeBlockAsm - MOVL R8, 24(SP)(R9*4) - SHRQ $0x08, SI - CMPL (DX)(BP*1), SI + MOVL R9, 24(SP)(R10*4) + SHRQ $0x08, DI + CMPL (DX)(SI*1), DI JEQ candidate3_match_encodeBlockAsm MOVL 20(SP), CX JMP search_loop_encodeBlockAsm @@ -582,279 +563,260 @@ candidate3_match_encodeBlockAsm: JMP candidate_match_encodeBlockAsm candidate2_match_encodeBlockAsm: - MOVL R8, 24(SP)(R9*4) + MOVL R9, 24(SP)(R10*4) INCL CX - MOVL DI, BP + MOVL R8, SI candidate_match_encodeBlockAsm: - MOVL 12(SP), SI - TESTL BP, BP + MOVL 12(SP), DI + TESTL SI, SI JZ match_extend_back_end_encodeBlockAsm match_extend_back_loop_encodeBlockAsm: - CMPL CX, SI + CMPL CX, DI JLE match_extend_back_end_encodeBlockAsm - MOVB -1(DX)(BP*1), BL - MOVB -1(DX)(CX*1), DI - CMPB BL, DI + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 JNE match_extend_back_end_encodeBlockAsm LEAL -1(CX), CX - DECL BP + DECL SI JZ match_extend_back_end_encodeBlockAsm JMP match_extend_back_loop_encodeBlockAsm match_extend_back_end_encodeBlockAsm: - MOVL CX, SI - SUBL 12(SP), SI - LEAQ 5(AX)(SI*1), SI - CMPQ SI, (SP) + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 5(AX)(DI*1), DI + CMPQ DI, (SP) JL match_dst_size_check_encodeBlockAsm MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeBlockAsm: - MOVL CX, SI - MOVL 12(SP), DI - CMPL DI, SI + MOVL CX, DI + MOVL 12(SP), R8 + CMPL R8, DI JEQ emit_literal_done_match_emit_encodeBlockAsm - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(DI*1), SI - SUBL DI, R8 - LEAL -1(R8), DI - CMPL DI, $0x3c + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(R8*1), DI + SUBL R8, R9 + LEAL -1(R9), R8 + CMPL R8, $0x3c JLT one_byte_match_emit_encodeBlockAsm - CMPL DI, $0x00000100 + CMPL R8, $0x00000100 JLT two_bytes_match_emit_encodeBlockAsm - CMPL DI, $0x00010000 + CMPL R8, $0x00010000 JLT three_bytes_match_emit_encodeBlockAsm - CMPL DI, $0x01000000 + CMPL R8, $0x01000000 JLT four_bytes_match_emit_encodeBlockAsm MOVB $0xfc, (AX) - MOVL DI, 1(AX) + MOVL R8, 1(AX) ADDQ $0x05, AX JMP memmove_long_match_emit_encodeBlockAsm four_bytes_match_emit_encodeBlockAsm: - MOVL DI, R9 - SHRL $0x10, R9 + MOVL R8, R10 + SHRL $0x10, R10 MOVB $0xf8, (AX) - MOVW DI, 1(AX) - MOVB R9, 3(AX) + MOVW R8, 1(AX) + MOVB R10, 3(AX) ADDQ $0x04, AX JMP memmove_long_match_emit_encodeBlockAsm three_bytes_match_emit_encodeBlockAsm: MOVB $0xf4, (AX) - MOVW DI, 1(AX) + MOVW R8, 1(AX) ADDQ $0x03, AX JMP memmove_long_match_emit_encodeBlockAsm two_bytes_match_emit_encodeBlockAsm: MOVB $0xf0, (AX) - MOVB DI, 1(AX) + MOVB R8, 1(AX) ADDQ $0x02, AX - CMPL DI, $0x40 + CMPL R8, $0x40 JL memmove_match_emit_encodeBlockAsm JMP memmove_long_match_emit_encodeBlockAsm one_byte_match_emit_encodeBlockAsm: - SHLB $0x02, DI - MOVB DI, (AX) + SHLB $0x02, R8 + MOVB R8, (AX) ADDQ $0x01, AX memmove_match_emit_encodeBlockAsm: - LEAQ (AX)(R8*1), DI + LEAQ (AX)(R9*1), R8 // genMemMoveShort - CMPQ R8, $0x03 - JB emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_1or2 - JE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_3 - CMPQ R8, $0x08 - JB emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_4through7 - CMPQ R8, $0x10 + CMPQ R9, $0x08 + JLE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8 + CMPQ R9, $0x10 JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16 - CMPQ R8, $0x20 + CMPQ R9, $0x20 JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64 -emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_1or2: - MOVB (SI), R9 - MOVB -1(SI)(R8*1), SI - MOVB R9, (AX) - MOVB SI, -1(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm - -emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_3: - MOVW (SI), R9 - MOVB 2(SI), SI - MOVW R9, (AX) - MOVB SI, 2(AX) - JMP memmove_end_copy_match_emit_encodeBlockAsm - -emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_4through7: - MOVL (SI), R9 - MOVL -4(SI)(R8*1), SI - MOVL R9, (AX) - MOVL SI, -4(AX)(R8*1) +emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8: + MOVQ (DI), R10 + MOVQ R10, (AX) JMP memmove_end_copy_match_emit_encodeBlockAsm emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16: - MOVQ (SI), R9 - MOVQ -8(SI)(R8*1), SI - MOVQ R9, (AX) - MOVQ SI, -8(AX)(R8*1) + MOVQ (DI), R10 + MOVQ -8(DI)(R9*1), DI + MOVQ R10, (AX) + MOVQ DI, -8(AX)(R9*1) JMP memmove_end_copy_match_emit_encodeBlockAsm emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32: - MOVOU (SI), X0 - MOVOU -16(SI)(R8*1), X1 + MOVOU (DI), X0 + MOVOU -16(DI)(R9*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) + MOVOU X1, -16(AX)(R9*1) JMP memmove_end_copy_match_emit_encodeBlockAsm emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64: - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) memmove_end_copy_match_emit_encodeBlockAsm: - MOVQ DI, AX + MOVQ R8, AX JMP emit_literal_done_match_emit_encodeBlockAsm memmove_long_match_emit_encodeBlockAsm: - LEAQ (AX)(R8*1), DI + LEAQ (AX)(R9*1), R8 // genMemMoveLong - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 - MOVQ R8, R10 - SHRQ $0x05, R10 - MOVQ AX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R11 - SUBQ R9, R11 - DECQ R10 + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVQ R9, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 JA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(SI)(R11*1), R9 - LEAQ -32(AX)(R11*1), R12 + LEAQ -32(DI)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back: - MOVOU (R9), X4 - MOVOU 16(R9), X5 - MOVOA X4, (R12) - MOVOA X5, 16(R12) + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 ADDQ $0x20, R12 - ADDQ $0x20, R9 - ADDQ $0x20, R11 - DECQ R10 + DECQ R11 JNA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(SI)(R11*1), X4 - MOVOU -16(SI)(R11*1), X5 - MOVOA X4, -32(AX)(R11*1) - MOVOA X5, -16(AX)(R11*1) - ADDQ $0x20, R11 - CMPQ R8, R11 + MOVOU -32(DI)(R12*1), X4 + MOVOU -16(DI)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R9, R12 JAE emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ DI, AX + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ R8, AX emit_literal_done_match_emit_encodeBlockAsm: match_nolit_loop_encodeBlockAsm: - MOVL CX, SI - SUBL BP, SI - MOVL SI, 16(SP) + MOVL CX, DI + SUBL SI, DI + MOVL DI, 16(SP) ADDL $0x04, CX - ADDL $0x04, BP - MOVQ src_len+32(FP), SI - SUBL CX, SI - LEAQ (DX)(CX*1), DI - LEAQ (DX)(BP*1), BP + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(SI*1), SI // matchLen - XORL R9, R9 - CMPL SI, $0x08 + XORL R10, R10 + CMPL DI, $0x08 JL matchlen_single_match_nolit_encodeBlockAsm matchlen_loopback_match_nolit_encodeBlockAsm: - MOVQ (DI)(R9*1), R8 - XORQ (BP)(R9*1), R8 - TESTQ R8, R8 + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + TESTQ R9, R9 JZ matchlen_loop_match_nolit_encodeBlockAsm - BSFQ R8, R8 - SARQ $0x03, R8 - LEAL (R9)(R8*1), R9 + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 JMP match_nolit_end_encodeBlockAsm matchlen_loop_match_nolit_encodeBlockAsm: - LEAL -8(SI), SI - LEAL 8(R9), R9 - CMPL SI, $0x08 + LEAL -8(DI), DI + LEAL 8(R10), R10 + CMPL DI, $0x08 JGE matchlen_loopback_match_nolit_encodeBlockAsm matchlen_single_match_nolit_encodeBlockAsm: - TESTL SI, SI + TESTL DI, DI JZ match_nolit_end_encodeBlockAsm matchlen_single_loopback_match_nolit_encodeBlockAsm: - MOVB (DI)(R9*1), R8 - CMPB (BP)(R9*1), R8 + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 JNE match_nolit_end_encodeBlockAsm - LEAL 1(R9), R9 - DECL SI + LEAL 1(R10), R10 + DECL DI JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm match_nolit_end_encodeBlockAsm: - ADDL R9, CX - MOVL 16(SP), BP - ADDL $0x04, R9 + ADDL R10, CX + MOVL 16(SP), SI + ADDL $0x04, R10 MOVL CX, 12(SP) // emitCopy - CMPL BP, $0x00010000 + CMPL SI, $0x00010000 JL two_byte_offset_match_nolit_encodeBlockAsm four_bytes_loop_back_match_nolit_encodeBlockAsm: - CMPL R9, $0x40 + CMPL R10, $0x40 JLE four_bytes_remain_match_nolit_encodeBlockAsm MOVB $0xff, (AX) - MOVL BP, 1(AX) - LEAL -64(R9), R9 + MOVL SI, 1(AX) + LEAL -64(R10), R10 ADDQ $0x05, AX - CMPL R9, $0x04 + CMPL R10, $0x04 JL four_bytes_remain_match_nolit_encodeBlockAsm // emitRepeat emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy: - MOVL R9, SI - LEAL -4(R9), R9 - CMPL SI, $0x08 + MOVL R10, DI + LEAL -4(R10), R10 + CMPL DI, $0x08 JLE repeat_two_match_nolit_encodeBlockAsm_emit_copy - CMPL SI, $0x0c + CMPL DI, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy: - CMPL R9, $0x00000104 + CMPL R10, $0x00000104 JLT repeat_three_match_nolit_encodeBlockAsm_emit_copy - CMPL R9, $0x00010100 + CMPL R10, $0x00010100 JLT repeat_four_match_nolit_encodeBlockAsm_emit_copy - CMPL R9, $0x0100ffff + CMPL R10, $0x0100ffff JLT repeat_five_match_nolit_encodeBlockAsm_emit_copy - LEAL -16842747(R9), R9 + LEAL -16842747(R10), R10 MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -862,85 +824,85 @@ cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy: JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy repeat_five_match_nolit_encodeBlockAsm_emit_copy: - LEAL -65536(R9), R9 - MOVL R9, BP + LEAL -65536(R10), R10 + MOVL R10, SI MOVW $0x001d, (AX) - MOVW R9, 2(AX) - SARL $0x10, BP - MOVB BP, 4(AX) + MOVW R10, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBlockAsm repeat_four_match_nolit_encodeBlockAsm_emit_copy: - LEAL -256(R9), R9 + LEAL -256(R10), R10 MOVW $0x0019, (AX) - MOVW R9, 2(AX) + MOVW R10, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBlockAsm repeat_three_match_nolit_encodeBlockAsm_emit_copy: - LEAL -4(R9), R9 + LEAL -4(R10), R10 MOVW $0x0015, (AX) - MOVB R9, 2(AX) + MOVB R10, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBlockAsm repeat_two_match_nolit_encodeBlockAsm_emit_copy: - SHLL $0x02, R9 - ORL $0x01, R9 - MOVW R9, (AX) + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy: - XORQ SI, SI - LEAL 1(SI)(R9*4), R9 - MOVB BP, 1(AX) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, R9 - MOVB R9, (AX) + XORQ DI, DI + LEAL 1(DI)(R10*4), R10 + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm JMP four_bytes_loop_back_match_nolit_encodeBlockAsm four_bytes_remain_match_nolit_encodeBlockAsm: - TESTL R9, R9 + TESTL R10, R10 JZ match_nolit_emitcopy_end_encodeBlockAsm MOVB $0x03, BL - LEAL -4(BX)(R9*4), R9 - MOVB R9, (AX) - MOVL BP, 1(AX) + LEAL -4(BX)(R10*4), R10 + MOVB R10, (AX) + MOVL SI, 1(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBlockAsm two_byte_offset_match_nolit_encodeBlockAsm: - CMPL R9, $0x40 + CMPL R10, $0x40 JLE two_byte_offset_short_match_nolit_encodeBlockAsm MOVB $0xee, (AX) - MOVW BP, 1(AX) - LEAL -60(R9), R9 + MOVW SI, 1(AX) + LEAL -60(R10), R10 ADDQ $0x03, AX // emitRepeat emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short: - MOVL R9, SI - LEAL -4(R9), R9 - CMPL SI, $0x08 + MOVL R10, DI + LEAL -4(R10), R10 + CMPL DI, $0x08 JLE repeat_two_match_nolit_encodeBlockAsm_emit_copy_short - CMPL SI, $0x0c + CMPL DI, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short: - CMPL R9, $0x00000104 + CMPL R10, $0x00000104 JLT repeat_three_match_nolit_encodeBlockAsm_emit_copy_short - CMPL R9, $0x00010100 + CMPL R10, $0x00010100 JLT repeat_four_match_nolit_encodeBlockAsm_emit_copy_short - CMPL R9, $0x0100ffff + CMPL R10, $0x0100ffff JLT repeat_five_match_nolit_encodeBlockAsm_emit_copy_short - LEAL -16842747(R9), R9 + LEAL -16842747(R10), R10 MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -948,96 +910,96 @@ cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short: JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short repeat_five_match_nolit_encodeBlockAsm_emit_copy_short: - LEAL -65536(R9), R9 - MOVL R9, BP + LEAL -65536(R10), R10 + MOVL R10, SI MOVW $0x001d, (AX) - MOVW R9, 2(AX) - SARL $0x10, BP - MOVB BP, 4(AX) + MOVW R10, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBlockAsm repeat_four_match_nolit_encodeBlockAsm_emit_copy_short: - LEAL -256(R9), R9 + LEAL -256(R10), R10 MOVW $0x0019, (AX) - MOVW R9, 2(AX) + MOVW R10, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBlockAsm repeat_three_match_nolit_encodeBlockAsm_emit_copy_short: - LEAL -4(R9), R9 + LEAL -4(R10), R10 MOVW $0x0015, (AX) - MOVB R9, 2(AX) + MOVB R10, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBlockAsm repeat_two_match_nolit_encodeBlockAsm_emit_copy_short: - SHLL $0x02, R9 - ORL $0x01, R9 - MOVW R9, (AX) + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short: - XORQ SI, SI - LEAL 1(SI)(R9*4), R9 - MOVB BP, 1(AX) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, R9 - MOVB R9, (AX) + XORQ DI, DI + LEAL 1(DI)(R10*4), R10 + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm JMP two_byte_offset_match_nolit_encodeBlockAsm two_byte_offset_short_match_nolit_encodeBlockAsm: - CMPL R9, $0x0c + CMPL R10, $0x0c JGE emit_copy_three_match_nolit_encodeBlockAsm - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JGE emit_copy_three_match_nolit_encodeBlockAsm MOVB $0x01, BL - LEAL -16(BX)(R9*4), R9 - MOVB BP, 1(AX) - SHRL $0x08, BP - SHLL $0x05, BP - ORL BP, R9 - MOVB R9, (AX) + LEAL -16(BX)(R10*4), R10 + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm emit_copy_three_match_nolit_encodeBlockAsm: MOVB $0x02, BL - LEAL -4(BX)(R9*4), R9 - MOVB R9, (AX) - MOVW BP, 1(AX) + LEAL -4(BX)(R10*4), R10 + MOVB R10, (AX) + MOVW SI, 1(AX) ADDQ $0x03, AX match_nolit_emitcopy_end_encodeBlockAsm: CMPL CX, 8(SP) JGE emit_remainder_encodeBlockAsm - MOVQ -2(DX)(CX*1), SI + MOVQ -2(DX)(CX*1), DI CMPQ AX, (SP) JL match_nolit_dst_ok_encodeBlockAsm MOVQ $0x00000000, ret+48(FP) RET match_nolit_dst_ok_encodeBlockAsm: - MOVQ $0x0000cf1bbcdcbf9b, R8 - MOVQ SI, DI - SHRQ $0x10, SI - MOVQ SI, BP - SHLQ $0x10, DI - IMULQ R8, DI - SHRQ $0x32, DI - SHLQ $0x10, BP - IMULQ R8, BP - SHRQ $0x32, BP - LEAL -2(CX), R8 - LEAQ 24(SP)(BP*4), R9 - MOVL (R9), BP - MOVL R8, 24(SP)(DI*4) - MOVL CX, (R9) - CMPL (DX)(BP*1), SI + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, SI + SHLQ $0x10, R8 + IMULQ R9, R8 + SHRQ $0x32, R8 + SHLQ $0x10, SI + IMULQ R9, SI + SHRQ $0x32, SI + LEAL -2(CX), R9 + LEAQ 24(SP)(SI*4), R10 + MOVL (R10), SI + MOVL R9, 24(SP)(R8*4) + MOVL CX, (R10) + CMPL (DX)(SI*1), DI JEQ match_nolit_loop_encodeBlockAsm INCL CX JMP search_loop_encodeBlockAsm @@ -1056,11 +1018,11 @@ emit_remainder_ok_encodeBlockAsm: MOVL 12(SP), BX CMPL BX, CX JEQ emit_literal_done_emit_remainder_encodeBlockAsm - MOVL CX, BP + MOVL CX, SI MOVL CX, 12(SP) LEAQ (DX)(BX*1), CX - SUBL BX, BP - LEAL -1(BP), DX + SUBL BX, SI + LEAL -1(SI), DX CMPL DX, $0x3c JLT one_byte_emit_remainder_encodeBlockAsm CMPL DX, $0x00000100 @@ -1103,46 +1065,27 @@ one_byte_emit_remainder_encodeBlockAsm: ADDQ $0x01, AX memmove_emit_remainder_encodeBlockAsm: - LEAQ (AX)(BP*1), DX - MOVL BP, BX + LEAQ (AX)(SI*1), DX + MOVL SI, BX // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3 CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through7 + JLE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2: - MOVB (CX), BP - MOVB -1(CX)(BX*1), CL - MOVB BP, (AX) - MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm - -emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3: - MOVW (CX), BP - MOVB 2(CX), CL - MOVW BP, (AX) - MOVB CL, 2(AX) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm - -emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through7: - MOVL (CX), BP - MOVL -4(CX)(BX*1), CX - MOVL BP, (AX) - MOVL CX, -4(AX)(BX*1) +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8: + MOVQ (CX), SI + MOVQ SI, (AX) JMP memmove_end_copy_emit_remainder_encodeBlockAsm emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16: - MOVQ (CX), BP + MOVQ (CX), SI MOVQ -8(CX)(BX*1), CX - MOVQ BP, (AX) + MOVQ SI, (AX) MOVQ CX, -8(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm @@ -1168,43 +1111,43 @@ memmove_end_copy_emit_remainder_encodeBlockAsm: JMP emit_literal_done_emit_remainder_encodeBlockAsm memmove_long_emit_remainder_encodeBlockAsm: - LEAQ (AX)(BP*1), DX - MOVL BP, BX + LEAQ (AX)(SI*1), DX + MOVL SI, BX // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 - MOVQ BX, SI - SHRQ $0x05, SI - MOVQ AX, BP - ANDL $0x0000001f, BP - MOVQ $0x00000040, DI - SUBQ BP, DI - DECQ SI + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(CX)(DI*1), BP - LEAQ -32(AX)(DI*1), R8 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back: - MOVOU (BP), X4 - MOVOU 16(BP), X5 - MOVOA X4, (R8) - MOVOA X5, 16(R8) + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI ADDQ $0x20, R8 - ADDQ $0x20, BP - ADDQ $0x20, DI - DECQ SI + DECQ DI JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(CX)(DI*1), X4 - MOVOU -16(CX)(DI*1), X5 - MOVOA X4, -32(AX)(DI*1) - MOVOA X5, -16(AX)(DI*1) - ADDQ $0x20, DI - CMPQ BX, DI + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) @@ -1240,9 +1183,9 @@ zero_loop_encodeBlockAsm4MB: JNZ zero_loop_encodeBlockAsm4MB MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX - LEAQ -5(CX), DX - LEAQ -8(CX), BP - MOVL BP, 8(SP) + LEAQ -9(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) SHRQ $0x05, CX SUBL CX, DX LEAQ (AX)(DX*1), DX @@ -1252,481 +1195,462 @@ zero_loop_encodeBlockAsm4MB: MOVQ src_base+24(FP), DX search_loop_encodeBlockAsm4MB: - MOVQ (DX)(CX*1), SI - MOVL CX, BP - SUBL 12(SP), BP - SHRL $0x06, BP - LEAL 4(CX)(BP*1), BP - CMPL BP, 8(SP) + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x06, SI + LEAL 4(CX)(SI*1), SI + CMPL SI, 8(SP) JGE emit_remainder_encodeBlockAsm4MB - MOVL BP, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R8 - MOVQ SI, R9 - MOVQ SI, R10 - SHRQ $0x08, R10 - SHLQ $0x10, R9 - IMULQ R8, R9 - SHRQ $0x32, R9 + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 SHLQ $0x10, R10 - IMULQ R8, R10 + IMULQ R9, R10 SHRQ $0x32, R10 - MOVL 24(SP)(R9*4), BP - MOVL 24(SP)(R10*4), DI - MOVL CX, 24(SP)(R9*4) - LEAL 1(CX), R9 - MOVL R9, 24(SP)(R10*4) - MOVQ SI, R9 - SHRQ $0x10, R9 - SHLQ $0x10, R9 - IMULQ R8, R9 - SHRQ $0x32, R9 - MOVL CX, R8 - SUBL 16(SP), R8 - MOVL 1(DX)(R8*1), R10 - MOVQ SI, R8 - SHRQ $0x08, R8 - CMPL R8, R10 + SHLQ $0x10, R11 + IMULQ R9, R11 + SHRQ $0x32, R11 + MOVL 24(SP)(R10*4), SI + MOVL 24(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + LEAL 1(CX), R10 + MOVL R10, 24(SP)(R11*4) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + MOVL CX, R9 + SUBL 16(SP), R9 + MOVL 1(DX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 JNE no_repeat_found_encodeBlockAsm4MB - LEAL 1(CX), SI - MOVL 12(SP), DI - MOVL SI, BP - SUBL 16(SP), BP + LEAL 1(CX), DI + MOVL 12(SP), R8 + MOVL DI, SI + SUBL 16(SP), SI JZ repeat_extend_back_end_encodeBlockAsm4MB repeat_extend_back_loop_encodeBlockAsm4MB: - CMPL SI, DI + CMPL DI, R8 JLE repeat_extend_back_end_encodeBlockAsm4MB - MOVB -1(DX)(BP*1), BL - MOVB -1(DX)(SI*1), R8 - CMPB BL, R8 + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(DI*1), R9 + CMPB BL, R9 JNE repeat_extend_back_end_encodeBlockAsm4MB - LEAL -1(SI), SI - DECL BP + LEAL -1(DI), DI + DECL SI JNZ repeat_extend_back_loop_encodeBlockAsm4MB repeat_extend_back_end_encodeBlockAsm4MB: - MOVL 12(SP), BP - CMPL BP, SI + MOVL 12(SP), SI + CMPL SI, DI JEQ emit_literal_done_repeat_emit_encodeBlockAsm4MB - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(BP*1), R9 - SUBL BP, R8 - LEAL -1(R8), BP - CMPL BP, $0x3c + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c JLT one_byte_repeat_emit_encodeBlockAsm4MB - CMPL BP, $0x00000100 + CMPL SI, $0x00000100 JLT two_bytes_repeat_emit_encodeBlockAsm4MB - CMPL BP, $0x00010000 + CMPL SI, $0x00010000 JLT three_bytes_repeat_emit_encodeBlockAsm4MB - MOVL BP, R10 - SHRL $0x10, R10 + MOVL SI, R11 + SHRL $0x10, R11 MOVB $0xf8, (AX) - MOVW BP, 1(AX) - MOVB R10, 3(AX) + MOVW SI, 1(AX) + MOVB R11, 3(AX) ADDQ $0x04, AX JMP memmove_long_repeat_emit_encodeBlockAsm4MB three_bytes_repeat_emit_encodeBlockAsm4MB: MOVB $0xf4, (AX) - MOVW BP, 1(AX) + MOVW SI, 1(AX) ADDQ $0x03, AX JMP memmove_long_repeat_emit_encodeBlockAsm4MB two_bytes_repeat_emit_encodeBlockAsm4MB: MOVB $0xf0, (AX) - MOVB BP, 1(AX) + MOVB SI, 1(AX) ADDQ $0x02, AX - CMPL BP, $0x40 + CMPL SI, $0x40 JL memmove_repeat_emit_encodeBlockAsm4MB JMP memmove_long_repeat_emit_encodeBlockAsm4MB one_byte_repeat_emit_encodeBlockAsm4MB: - SHLB $0x02, BP - MOVB BP, (AX) + SHLB $0x02, SI + MOVB SI, (AX) ADDQ $0x01, AX memmove_repeat_emit_encodeBlockAsm4MB: - LEAQ (AX)(R8*1), BP + LEAQ (AX)(R9*1), SI // genMemMoveShort - CMPQ R8, $0x03 - JB emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_1or2 - JE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_3 - CMPQ R8, $0x08 - JB emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_4through7 - CMPQ R8, $0x10 + CMPQ R9, $0x08 + JLE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8 + CMPQ R9, $0x10 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16 - CMPQ R8, $0x20 + CMPQ R9, $0x20 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64 -emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_1or2: - MOVB (R9), R10 - MOVB -1(R9)(R8*1), R9 - MOVB R10, (AX) - MOVB R9, -1(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB - -emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_3: - MOVW (R9), R10 - MOVB 2(R9), R9 - MOVW R10, (AX) - MOVB R9, 2(AX) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB - -emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_4through7: - MOVL (R9), R10 - MOVL -4(R9)(R8*1), R9 - MOVL R10, (AX) - MOVL R9, -4(AX)(R8*1) +emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8: + MOVQ (R10), R11 + MOVQ R11, (AX) JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) + MOVOU X1, -16(AX)(R9*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) memmove_end_copy_repeat_emit_encodeBlockAsm4MB: - MOVQ BP, AX + MOVQ SI, AX JMP emit_literal_done_repeat_emit_encodeBlockAsm4MB memmove_long_repeat_emit_encodeBlockAsm4MB: - LEAQ (AX)(R8*1), BP + LEAQ (AX)(R9*1), SI // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R12 + SHRQ $0x05, R12 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R13 + SUBQ R11, R13 + DECQ R12 JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 - LEAQ -32(R9)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 + LEAQ -32(R10)(R13*1), R11 + LEAQ -32(AX)(R13*1), R14 emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R11 ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 + DECQ R12 JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32: - MOVOU -32(R9)(R12*1), X4 - MOVOU -16(R9)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R8, R12 + MOVOU -32(R10)(R13*1), X4 + MOVOU -16(R10)(R13*1), X5 + MOVOA X4, -32(AX)(R13*1) + MOVOA X5, -16(AX)(R13*1) + ADDQ $0x20, R13 + CMPQ R9, R13 JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ BP, AX + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX emit_literal_done_repeat_emit_encodeBlockAsm4MB: ADDL $0x05, CX - MOVL CX, BP - SUBL 16(SP), BP - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(BP*1), BP + MOVL CX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R9 + SUBL CX, R9 + LEAQ (DX)(CX*1), R10 + LEAQ (DX)(SI*1), SI // matchLen - XORL R11, R11 - CMPL R8, $0x08 + XORL R12, R12 + CMPL R9, $0x08 JL matchlen_single_repeat_extend_encodeBlockAsm4MB matchlen_loopback_repeat_extend_encodeBlockAsm4MB: - MOVQ (R9)(R11*1), R10 - XORQ (BP)(R11*1), R10 - TESTQ R10, R10 + MOVQ (R10)(R12*1), R11 + XORQ (SI)(R12*1), R11 + TESTQ R11, R11 JZ matchlen_loop_repeat_extend_encodeBlockAsm4MB - BSFQ R10, R10 - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 + BSFQ R11, R11 + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 JMP repeat_extend_forward_end_encodeBlockAsm4MB matchlen_loop_repeat_extend_encodeBlockAsm4MB: - LEAL -8(R8), R8 - LEAL 8(R11), R11 - CMPL R8, $0x08 + LEAL -8(R9), R9 + LEAL 8(R12), R12 + CMPL R9, $0x08 JGE matchlen_loopback_repeat_extend_encodeBlockAsm4MB matchlen_single_repeat_extend_encodeBlockAsm4MB: - TESTL R8, R8 + TESTL R9, R9 JZ repeat_extend_forward_end_encodeBlockAsm4MB matchlen_single_loopback_repeat_extend_encodeBlockAsm4MB: - MOVB (R9)(R11*1), R10 - CMPB (BP)(R11*1), R10 + MOVB (R10)(R12*1), R11 + CMPB (SI)(R12*1), R11 JNE repeat_extend_forward_end_encodeBlockAsm4MB - LEAL 1(R11), R11 - DECL R8 + LEAL 1(R12), R12 + DECL R9 JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm4MB repeat_extend_forward_end_encodeBlockAsm4MB: - ADDL R11, CX - MOVL CX, BP - SUBL SI, BP - MOVL 16(SP), SI - TESTL DI, DI + ADDL R12, CX + MOVL CX, SI + SUBL DI, SI + MOVL 16(SP), DI + TESTL R8, R8 JZ repeat_as_copy_encodeBlockAsm4MB // emitRepeat - MOVL BP, DI - LEAL -4(BP), BP - CMPL DI, $0x08 + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 JLE repeat_two_match_repeat_encodeBlockAsm4MB - CMPL DI, $0x0c + CMPL R8, $0x0c JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB - CMPL SI, $0x00000800 + CMPL DI, $0x00000800 JLT repeat_two_offset_match_repeat_encodeBlockAsm4MB cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB: - CMPL BP, $0x00000104 + CMPL SI, $0x00000104 JLT repeat_three_match_repeat_encodeBlockAsm4MB - CMPL BP, $0x00010100 + CMPL SI, $0x00010100 JLT repeat_four_match_repeat_encodeBlockAsm4MB - LEAL -65536(BP), BP - MOVL BP, SI + LEAL -65536(SI), SI + MOVL SI, DI MOVW $0x001d, (AX) - MOVW BP, 2(AX) - SARL $0x10, SI - MOVB SI, 4(AX) + MOVW SI, 2(AX) + SARL $0x10, DI + MOVB DI, 4(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsm4MB repeat_four_match_repeat_encodeBlockAsm4MB: - LEAL -256(BP), BP + LEAL -256(SI), SI MOVW $0x0019, (AX) - MOVW BP, 2(AX) + MOVW SI, 2(AX) ADDQ $0x04, AX JMP repeat_end_emit_encodeBlockAsm4MB repeat_three_match_repeat_encodeBlockAsm4MB: - LEAL -4(BP), BP + LEAL -4(SI), SI MOVW $0x0015, (AX) - MOVB BP, 2(AX) + MOVB SI, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsm4MB repeat_two_match_repeat_encodeBlockAsm4MB: - SHLL $0x02, BP - ORL $0x01, BP - MOVW BP, (AX) + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm4MB repeat_two_offset_match_repeat_encodeBlockAsm4MB: - XORQ DI, DI - LEAL 1(DI)(BP*4), BP - MOVB SI, 1(AX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, BP - MOVB BP, (AX) + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm4MB repeat_as_copy_encodeBlockAsm4MB: // emitCopy - CMPL SI, $0x00010000 + CMPL DI, $0x00010000 JL two_byte_offset_repeat_as_copy_encodeBlockAsm4MB four_bytes_loop_back_repeat_as_copy_encodeBlockAsm4MB: - CMPL BP, $0x40 + CMPL SI, $0x40 JLE four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB MOVB $0xff, (AX) - MOVL SI, 1(AX) - LEAL -64(BP), BP + MOVL DI, 1(AX) + LEAL -64(SI), SI ADDQ $0x05, AX - CMPL BP, $0x04 + CMPL SI, $0x04 JL four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB // emitRepeat - MOVL BP, DI - LEAL -4(BP), BP - CMPL DI, $0x08 + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 JLE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy - CMPL DI, $0x0c + CMPL R8, $0x0c JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy - CMPL SI, $0x00000800 + CMPL DI, $0x00000800 JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy: - CMPL BP, $0x00000104 + CMPL SI, $0x00000104 JLT repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy - CMPL BP, $0x00010100 + CMPL SI, $0x00010100 JLT repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy - LEAL -65536(BP), BP - MOVL BP, SI + LEAL -65536(SI), SI + MOVL SI, DI MOVW $0x001d, (AX) - MOVW BP, 2(AX) - SARL $0x10, SI - MOVB SI, 4(AX) + MOVW SI, 2(AX) + SARL $0x10, DI + MOVB DI, 4(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsm4MB repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy: - LEAL -256(BP), BP + LEAL -256(SI), SI MOVW $0x0019, (AX) - MOVW BP, 2(AX) + MOVW SI, 2(AX) ADDQ $0x04, AX JMP repeat_end_emit_encodeBlockAsm4MB repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy: - LEAL -4(BP), BP + LEAL -4(SI), SI MOVW $0x0015, (AX) - MOVB BP, 2(AX) + MOVB SI, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsm4MB repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy: - SHLL $0x02, BP - ORL $0x01, BP - MOVW BP, (AX) + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm4MB repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy: - XORQ DI, DI - LEAL 1(DI)(BP*4), BP - MOVB SI, 1(AX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, BP - MOVB BP, (AX) + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm4MB JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm4MB four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB: - TESTL BP, BP + TESTL SI, SI JZ repeat_end_emit_encodeBlockAsm4MB MOVB $0x03, BL - LEAL -4(BX)(BP*4), BP - MOVB BP, (AX) - MOVL SI, 1(AX) + LEAL -4(BX)(SI*4), SI + MOVB SI, (AX) + MOVL DI, 1(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsm4MB two_byte_offset_repeat_as_copy_encodeBlockAsm4MB: - CMPL BP, $0x40 + CMPL SI, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB MOVB $0xee, (AX) - MOVW SI, 1(AX) - LEAL -60(BP), BP + MOVW DI, 1(AX) + LEAL -60(SI), SI ADDQ $0x03, AX // emitRepeat - MOVL BP, DI - LEAL -4(BP), BP - CMPL DI, $0x08 + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 JLE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short - CMPL DI, $0x0c + CMPL R8, $0x0c JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short - CMPL SI, $0x00000800 + CMPL DI, $0x00000800 JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: - CMPL BP, $0x00000104 + CMPL SI, $0x00000104 JLT repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short - CMPL BP, $0x00010100 + CMPL SI, $0x00010100 JLT repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short - LEAL -65536(BP), BP - MOVL BP, SI + LEAL -65536(SI), SI + MOVL SI, DI MOVW $0x001d, (AX) - MOVW BP, 2(AX) - SARL $0x10, SI - MOVB SI, 4(AX) + MOVW SI, 2(AX) + SARL $0x10, DI + MOVB DI, 4(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsm4MB repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: - LEAL -256(BP), BP + LEAL -256(SI), SI MOVW $0x0019, (AX) - MOVW BP, 2(AX) + MOVW SI, 2(AX) ADDQ $0x04, AX JMP repeat_end_emit_encodeBlockAsm4MB repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: - LEAL -4(BP), BP + LEAL -4(SI), SI MOVW $0x0015, (AX) - MOVB BP, 2(AX) + MOVB SI, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsm4MB repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: - SHLL $0x02, BP - ORL $0x01, BP - MOVW BP, (AX) + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm4MB repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: - XORQ DI, DI - LEAL 1(DI)(BP*4), BP - MOVB SI, 1(AX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, BP - MOVB BP, (AX) + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm4MB JMP two_byte_offset_repeat_as_copy_encodeBlockAsm4MB two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB: - CMPL BP, $0x0c + CMPL SI, $0x0c JGE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB - CMPL SI, $0x00000800 + CMPL DI, $0x00000800 JGE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB MOVB $0x01, BL - LEAL -16(BX)(BP*4), BP - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, BP - MOVB BP, (AX) + LEAL -16(BX)(SI*4), SI + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm4MB emit_copy_three_repeat_as_copy_encodeBlockAsm4MB: MOVB $0x02, BL - LEAL -4(BX)(BP*4), BP - MOVB BP, (AX) - MOVW SI, 1(AX) + LEAL -4(BX)(SI*4), SI + MOVB SI, (AX) + MOVW DI, 1(AX) ADDQ $0x03, AX repeat_end_emit_encodeBlockAsm4MB: @@ -1734,16 +1658,16 @@ repeat_end_emit_encodeBlockAsm4MB: JMP search_loop_encodeBlockAsm4MB no_repeat_found_encodeBlockAsm4MB: - CMPL (DX)(BP*1), SI + CMPL (DX)(SI*1), DI JEQ candidate_match_encodeBlockAsm4MB - SHRQ $0x08, SI - MOVL 24(SP)(R9*4), BP - LEAL 2(CX), R8 - CMPL (DX)(DI*1), SI + SHRQ $0x08, DI + MOVL 24(SP)(R10*4), SI + LEAL 2(CX), R9 + CMPL (DX)(R8*1), DI JEQ candidate2_match_encodeBlockAsm4MB - MOVL R8, 24(SP)(R9*4) - SHRQ $0x08, SI - CMPL (DX)(BP*1), SI + MOVL R9, 24(SP)(R10*4) + SHRQ $0x08, DI + CMPL (DX)(SI*1), DI JEQ candidate3_match_encodeBlockAsm4MB MOVL 20(SP), CX JMP search_loop_encodeBlockAsm4MB @@ -1753,432 +1677,413 @@ candidate3_match_encodeBlockAsm4MB: JMP candidate_match_encodeBlockAsm4MB candidate2_match_encodeBlockAsm4MB: - MOVL R8, 24(SP)(R9*4) + MOVL R9, 24(SP)(R10*4) INCL CX - MOVL DI, BP + MOVL R8, SI candidate_match_encodeBlockAsm4MB: - MOVL 12(SP), SI - TESTL BP, BP + MOVL 12(SP), DI + TESTL SI, SI JZ match_extend_back_end_encodeBlockAsm4MB match_extend_back_loop_encodeBlockAsm4MB: - CMPL CX, SI + CMPL CX, DI JLE match_extend_back_end_encodeBlockAsm4MB - MOVB -1(DX)(BP*1), BL - MOVB -1(DX)(CX*1), DI - CMPB BL, DI + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 JNE match_extend_back_end_encodeBlockAsm4MB LEAL -1(CX), CX - DECL BP + DECL SI JZ match_extend_back_end_encodeBlockAsm4MB JMP match_extend_back_loop_encodeBlockAsm4MB match_extend_back_end_encodeBlockAsm4MB: - MOVL CX, SI - SUBL 12(SP), SI - LEAQ 4(AX)(SI*1), SI - CMPQ SI, (SP) + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 4(AX)(DI*1), DI + CMPQ DI, (SP) JL match_dst_size_check_encodeBlockAsm4MB MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeBlockAsm4MB: - MOVL CX, SI - MOVL 12(SP), DI - CMPL DI, SI + MOVL CX, DI + MOVL 12(SP), R8 + CMPL R8, DI JEQ emit_literal_done_match_emit_encodeBlockAsm4MB - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(DI*1), SI - SUBL DI, R8 - LEAL -1(R8), DI - CMPL DI, $0x3c - JLT one_byte_match_emit_encodeBlockAsm4MB - CMPL DI, $0x00000100 - JLT two_bytes_match_emit_encodeBlockAsm4MB - CMPL DI, $0x00010000 - JLT three_bytes_match_emit_encodeBlockAsm4MB MOVL DI, R9 - SHRL $0x10, R9 + MOVL DI, 12(SP) + LEAQ (DX)(R8*1), DI + SUBL R8, R9 + LEAL -1(R9), R8 + CMPL R8, $0x3c + JLT one_byte_match_emit_encodeBlockAsm4MB + CMPL R8, $0x00000100 + JLT two_bytes_match_emit_encodeBlockAsm4MB + CMPL R8, $0x00010000 + JLT three_bytes_match_emit_encodeBlockAsm4MB + MOVL R8, R10 + SHRL $0x10, R10 MOVB $0xf8, (AX) - MOVW DI, 1(AX) - MOVB R9, 3(AX) + MOVW R8, 1(AX) + MOVB R10, 3(AX) ADDQ $0x04, AX JMP memmove_long_match_emit_encodeBlockAsm4MB three_bytes_match_emit_encodeBlockAsm4MB: MOVB $0xf4, (AX) - MOVW DI, 1(AX) + MOVW R8, 1(AX) ADDQ $0x03, AX JMP memmove_long_match_emit_encodeBlockAsm4MB two_bytes_match_emit_encodeBlockAsm4MB: MOVB $0xf0, (AX) - MOVB DI, 1(AX) + MOVB R8, 1(AX) ADDQ $0x02, AX - CMPL DI, $0x40 + CMPL R8, $0x40 JL memmove_match_emit_encodeBlockAsm4MB JMP memmove_long_match_emit_encodeBlockAsm4MB one_byte_match_emit_encodeBlockAsm4MB: - SHLB $0x02, DI - MOVB DI, (AX) + SHLB $0x02, R8 + MOVB R8, (AX) ADDQ $0x01, AX memmove_match_emit_encodeBlockAsm4MB: - LEAQ (AX)(R8*1), DI + LEAQ (AX)(R9*1), R8 // genMemMoveShort - CMPQ R8, $0x03 - JB emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_1or2 - JE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_3 - CMPQ R8, $0x08 - JB emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_4through7 - CMPQ R8, $0x10 + CMPQ R9, $0x08 + JLE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8 + CMPQ R9, $0x10 JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16 - CMPQ R8, $0x20 + CMPQ R9, $0x20 JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64 -emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_1or2: - MOVB (SI), R9 - MOVB -1(SI)(R8*1), SI - MOVB R9, (AX) - MOVB SI, -1(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm4MB - -emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_3: - MOVW (SI), R9 - MOVB 2(SI), SI - MOVW R9, (AX) - MOVB SI, 2(AX) - JMP memmove_end_copy_match_emit_encodeBlockAsm4MB - -emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_4through7: - MOVL (SI), R9 - MOVL -4(SI)(R8*1), SI - MOVL R9, (AX) - MOVL SI, -4(AX)(R8*1) +emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8: + MOVQ (DI), R10 + MOVQ R10, (AX) JMP memmove_end_copy_match_emit_encodeBlockAsm4MB emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16: - MOVQ (SI), R9 - MOVQ -8(SI)(R8*1), SI - MOVQ R9, (AX) - MOVQ SI, -8(AX)(R8*1) + MOVQ (DI), R10 + MOVQ -8(DI)(R9*1), DI + MOVQ R10, (AX) + MOVQ DI, -8(AX)(R9*1) JMP memmove_end_copy_match_emit_encodeBlockAsm4MB emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32: - MOVOU (SI), X0 - MOVOU -16(SI)(R8*1), X1 + MOVOU (DI), X0 + MOVOU -16(DI)(R9*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) + MOVOU X1, -16(AX)(R9*1) JMP memmove_end_copy_match_emit_encodeBlockAsm4MB emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64: - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) memmove_end_copy_match_emit_encodeBlockAsm4MB: - MOVQ DI, AX + MOVQ R8, AX JMP emit_literal_done_match_emit_encodeBlockAsm4MB memmove_long_match_emit_encodeBlockAsm4MB: - LEAQ (AX)(R8*1), DI + LEAQ (AX)(R9*1), R8 // genMemMoveLong - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 - MOVQ R8, R10 - SHRQ $0x05, R10 - MOVQ AX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R11 - SUBQ R9, R11 - DECQ R10 + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVQ R9, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 JA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 - LEAQ -32(SI)(R11*1), R9 - LEAQ -32(AX)(R11*1), R12 + LEAQ -32(DI)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back: - MOVOU (R9), X4 - MOVOU 16(R9), X5 - MOVOA X4, (R12) - MOVOA X5, 16(R12) + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 ADDQ $0x20, R12 - ADDQ $0x20, R9 - ADDQ $0x20, R11 - DECQ R10 + DECQ R11 JNA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32: - MOVOU -32(SI)(R11*1), X4 - MOVOU -16(SI)(R11*1), X5 - MOVOA X4, -32(AX)(R11*1) - MOVOA X5, -16(AX)(R11*1) - ADDQ $0x20, R11 - CMPQ R8, R11 + MOVOU -32(DI)(R12*1), X4 + MOVOU -16(DI)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R9, R12 JAE emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ DI, AX + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ R8, AX emit_literal_done_match_emit_encodeBlockAsm4MB: match_nolit_loop_encodeBlockAsm4MB: - MOVL CX, SI - SUBL BP, SI - MOVL SI, 16(SP) + MOVL CX, DI + SUBL SI, DI + MOVL DI, 16(SP) ADDL $0x04, CX - ADDL $0x04, BP - MOVQ src_len+32(FP), SI - SUBL CX, SI - LEAQ (DX)(CX*1), DI - LEAQ (DX)(BP*1), BP + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(SI*1), SI // matchLen - XORL R9, R9 - CMPL SI, $0x08 + XORL R10, R10 + CMPL DI, $0x08 JL matchlen_single_match_nolit_encodeBlockAsm4MB matchlen_loopback_match_nolit_encodeBlockAsm4MB: - MOVQ (DI)(R9*1), R8 - XORQ (BP)(R9*1), R8 - TESTQ R8, R8 + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + TESTQ R9, R9 JZ matchlen_loop_match_nolit_encodeBlockAsm4MB - BSFQ R8, R8 - SARQ $0x03, R8 - LEAL (R9)(R8*1), R9 + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 JMP match_nolit_end_encodeBlockAsm4MB matchlen_loop_match_nolit_encodeBlockAsm4MB: - LEAL -8(SI), SI - LEAL 8(R9), R9 - CMPL SI, $0x08 + LEAL -8(DI), DI + LEAL 8(R10), R10 + CMPL DI, $0x08 JGE matchlen_loopback_match_nolit_encodeBlockAsm4MB matchlen_single_match_nolit_encodeBlockAsm4MB: - TESTL SI, SI + TESTL DI, DI JZ match_nolit_end_encodeBlockAsm4MB matchlen_single_loopback_match_nolit_encodeBlockAsm4MB: - MOVB (DI)(R9*1), R8 - CMPB (BP)(R9*1), R8 + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 JNE match_nolit_end_encodeBlockAsm4MB - LEAL 1(R9), R9 - DECL SI + LEAL 1(R10), R10 + DECL DI JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm4MB match_nolit_end_encodeBlockAsm4MB: - ADDL R9, CX - MOVL 16(SP), BP - ADDL $0x04, R9 + ADDL R10, CX + MOVL 16(SP), SI + ADDL $0x04, R10 MOVL CX, 12(SP) // emitCopy - CMPL BP, $0x00010000 + CMPL SI, $0x00010000 JL two_byte_offset_match_nolit_encodeBlockAsm4MB four_bytes_loop_back_match_nolit_encodeBlockAsm4MB: - CMPL R9, $0x40 + CMPL R10, $0x40 JLE four_bytes_remain_match_nolit_encodeBlockAsm4MB MOVB $0xff, (AX) - MOVL BP, 1(AX) - LEAL -64(R9), R9 + MOVL SI, 1(AX) + LEAL -64(R10), R10 ADDQ $0x05, AX - CMPL R9, $0x04 + CMPL R10, $0x04 JL four_bytes_remain_match_nolit_encodeBlockAsm4MB // emitRepeat - MOVL R9, SI - LEAL -4(R9), R9 - CMPL SI, $0x08 + MOVL R10, DI + LEAL -4(R10), R10 + CMPL DI, $0x08 JLE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy - CMPL SI, $0x0c + CMPL DI, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy: - CMPL R9, $0x00000104 + CMPL R10, $0x00000104 JLT repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy - CMPL R9, $0x00010100 + CMPL R10, $0x00010100 JLT repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy - LEAL -65536(R9), R9 - MOVL R9, BP + LEAL -65536(R10), R10 + MOVL R10, SI MOVW $0x001d, (AX) - MOVW R9, 2(AX) - SARL $0x10, BP - MOVB BP, 4(AX) + MOVW R10, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBlockAsm4MB repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy: - LEAL -256(R9), R9 + LEAL -256(R10), R10 MOVW $0x0019, (AX) - MOVW R9, 2(AX) + MOVW R10, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBlockAsm4MB repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy: - LEAL -4(R9), R9 + LEAL -4(R10), R10 MOVW $0x0015, (AX) - MOVB R9, 2(AX) + MOVB R10, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBlockAsm4MB repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy: - SHLL $0x02, R9 - ORL $0x01, R9 - MOVW R9, (AX) + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm4MB repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy: - XORQ SI, SI - LEAL 1(SI)(R9*4), R9 - MOVB BP, 1(AX) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, R9 - MOVB R9, (AX) + XORQ DI, DI + LEAL 1(DI)(R10*4), R10 + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm4MB JMP four_bytes_loop_back_match_nolit_encodeBlockAsm4MB four_bytes_remain_match_nolit_encodeBlockAsm4MB: - TESTL R9, R9 + TESTL R10, R10 JZ match_nolit_emitcopy_end_encodeBlockAsm4MB MOVB $0x03, BL - LEAL -4(BX)(R9*4), R9 - MOVB R9, (AX) - MOVL BP, 1(AX) + LEAL -4(BX)(R10*4), R10 + MOVB R10, (AX) + MOVL SI, 1(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBlockAsm4MB two_byte_offset_match_nolit_encodeBlockAsm4MB: - CMPL R9, $0x40 + CMPL R10, $0x40 JLE two_byte_offset_short_match_nolit_encodeBlockAsm4MB MOVB $0xee, (AX) - MOVW BP, 1(AX) - LEAL -60(R9), R9 + MOVW SI, 1(AX) + LEAL -60(R10), R10 ADDQ $0x03, AX // emitRepeat - MOVL R9, SI - LEAL -4(R9), R9 - CMPL SI, $0x08 + MOVL R10, DI + LEAL -4(R10), R10 + CMPL DI, $0x08 JLE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short - CMPL SI, $0x0c + CMPL DI, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short: - CMPL R9, $0x00000104 + CMPL R10, $0x00000104 JLT repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short - CMPL R9, $0x00010100 + CMPL R10, $0x00010100 JLT repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short - LEAL -65536(R9), R9 - MOVL R9, BP + LEAL -65536(R10), R10 + MOVL R10, SI MOVW $0x001d, (AX) - MOVW R9, 2(AX) - SARL $0x10, BP - MOVB BP, 4(AX) + MOVW R10, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBlockAsm4MB repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short: - LEAL -256(R9), R9 + LEAL -256(R10), R10 MOVW $0x0019, (AX) - MOVW R9, 2(AX) + MOVW R10, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBlockAsm4MB repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short: - LEAL -4(R9), R9 + LEAL -4(R10), R10 MOVW $0x0015, (AX) - MOVB R9, 2(AX) + MOVB R10, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBlockAsm4MB repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short: - SHLL $0x02, R9 - ORL $0x01, R9 - MOVW R9, (AX) + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm4MB repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short: - XORQ SI, SI - LEAL 1(SI)(R9*4), R9 - MOVB BP, 1(AX) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, R9 - MOVB R9, (AX) + XORQ DI, DI + LEAL 1(DI)(R10*4), R10 + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm4MB JMP two_byte_offset_match_nolit_encodeBlockAsm4MB two_byte_offset_short_match_nolit_encodeBlockAsm4MB: - CMPL R9, $0x0c + CMPL R10, $0x0c JGE emit_copy_three_match_nolit_encodeBlockAsm4MB - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JGE emit_copy_three_match_nolit_encodeBlockAsm4MB MOVB $0x01, BL - LEAL -16(BX)(R9*4), R9 - MOVB BP, 1(AX) - SHRL $0x08, BP - SHLL $0x05, BP - ORL BP, R9 - MOVB R9, (AX) + LEAL -16(BX)(R10*4), R10 + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm4MB emit_copy_three_match_nolit_encodeBlockAsm4MB: MOVB $0x02, BL - LEAL -4(BX)(R9*4), R9 - MOVB R9, (AX) - MOVW BP, 1(AX) + LEAL -4(BX)(R10*4), R10 + MOVB R10, (AX) + MOVW SI, 1(AX) ADDQ $0x03, AX match_nolit_emitcopy_end_encodeBlockAsm4MB: CMPL CX, 8(SP) JGE emit_remainder_encodeBlockAsm4MB - MOVQ -2(DX)(CX*1), SI + MOVQ -2(DX)(CX*1), DI CMPQ AX, (SP) JL match_nolit_dst_ok_encodeBlockAsm4MB MOVQ $0x00000000, ret+48(FP) RET match_nolit_dst_ok_encodeBlockAsm4MB: - MOVQ $0x0000cf1bbcdcbf9b, R8 - MOVQ SI, DI - SHRQ $0x10, SI - MOVQ SI, BP - SHLQ $0x10, DI - IMULQ R8, DI - SHRQ $0x32, DI - SHLQ $0x10, BP - IMULQ R8, BP - SHRQ $0x32, BP - LEAL -2(CX), R8 - LEAQ 24(SP)(BP*4), R9 - MOVL (R9), BP - MOVL R8, 24(SP)(DI*4) - MOVL CX, (R9) - CMPL (DX)(BP*1), SI + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, SI + SHLQ $0x10, R8 + IMULQ R9, R8 + SHRQ $0x32, R8 + SHLQ $0x10, SI + IMULQ R9, SI + SHRQ $0x32, SI + LEAL -2(CX), R9 + LEAQ 24(SP)(SI*4), R10 + MOVL (R10), SI + MOVL R9, 24(SP)(R8*4) + MOVL CX, (R10) + CMPL (DX)(SI*1), DI JEQ match_nolit_loop_encodeBlockAsm4MB INCL CX JMP search_loop_encodeBlockAsm4MB @@ -2197,11 +2102,11 @@ emit_remainder_ok_encodeBlockAsm4MB: MOVL 12(SP), BX CMPL BX, CX JEQ emit_literal_done_emit_remainder_encodeBlockAsm4MB - MOVL CX, BP + MOVL CX, SI MOVL CX, 12(SP) LEAQ (DX)(BX*1), CX - SUBL BX, BP - LEAL -1(BP), DX + SUBL BX, SI + LEAL -1(SI), DX CMPL DX, $0x3c JLT one_byte_emit_remainder_encodeBlockAsm4MB CMPL DX, $0x00000100 @@ -2236,46 +2141,27 @@ one_byte_emit_remainder_encodeBlockAsm4MB: ADDQ $0x01, AX memmove_emit_remainder_encodeBlockAsm4MB: - LEAQ (AX)(BP*1), DX - MOVL BP, BX + LEAQ (AX)(SI*1), DX + MOVL SI, BX // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3 CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7 + JLE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2: - MOVB (CX), BP - MOVB -1(CX)(BX*1), CL - MOVB BP, (AX) - MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB - -emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3: - MOVW (CX), BP - MOVB 2(CX), CL - MOVW BP, (AX) - MOVB CL, 2(AX) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB - -emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7: - MOVL (CX), BP - MOVL -4(CX)(BX*1), CX - MOVL BP, (AX) - MOVL CX, -4(AX)(BX*1) +emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8: + MOVQ (CX), SI + MOVQ SI, (AX) JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16: - MOVQ (CX), BP + MOVQ (CX), SI MOVQ -8(CX)(BX*1), CX - MOVQ BP, (AX) + MOVQ SI, (AX) MOVQ CX, -8(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB @@ -2301,43 +2187,43 @@ memmove_end_copy_emit_remainder_encodeBlockAsm4MB: JMP emit_literal_done_emit_remainder_encodeBlockAsm4MB memmove_long_emit_remainder_encodeBlockAsm4MB: - LEAQ (AX)(BP*1), DX - MOVL BP, BX + LEAQ (AX)(SI*1), DX + MOVL SI, BX // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 - MOVQ BX, SI - SHRQ $0x05, SI - MOVQ AX, BP - ANDL $0x0000001f, BP - MOVQ $0x00000040, DI - SUBQ BP, DI - DECQ SI + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32 - LEAQ -32(CX)(DI*1), BP - LEAQ -32(AX)(DI*1), R8 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back: - MOVOU (BP), X4 - MOVOU 16(BP), X5 - MOVOA X4, (R8) - MOVOA X5, 16(R8) + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI ADDQ $0x20, R8 - ADDQ $0x20, BP - ADDQ $0x20, DI - DECQ SI + DECQ DI JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32: - MOVOU -32(CX)(DI*1), X4 - MOVOU -16(CX)(DI*1), X5 - MOVOA X4, -32(AX)(DI*1) - MOVOA X5, -16(AX)(DI*1) - ADDQ $0x20, DI - CMPQ BX, DI + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) @@ -2373,9 +2259,9 @@ zero_loop_encodeBlockAsm12B: JNZ zero_loop_encodeBlockAsm12B MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX - LEAQ -5(CX), DX - LEAQ -8(CX), BP - MOVL BP, 8(SP) + LEAQ -9(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) SHRQ $0x05, CX SUBL CX, DX LEAQ (AX)(DX*1), DX @@ -2385,366 +2271,347 @@ zero_loop_encodeBlockAsm12B: MOVQ src_base+24(FP), DX search_loop_encodeBlockAsm12B: - MOVQ (DX)(CX*1), SI - MOVL CX, BP - SUBL 12(SP), BP - SHRL $0x05, BP - LEAL 4(CX)(BP*1), BP - CMPL BP, 8(SP) + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x05, SI + LEAL 4(CX)(SI*1), SI + CMPL SI, 8(SP) JGE emit_remainder_encodeBlockAsm12B - MOVL BP, 20(SP) - MOVQ $0x000000cf1bbcdcbb, R8 - MOVQ SI, R9 - MOVQ SI, R10 - SHRQ $0x08, R10 - SHLQ $0x18, R9 - IMULQ R8, R9 - SHRQ $0x34, R9 + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x000000cf1bbcdcbb, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 SHLQ $0x18, R10 - IMULQ R8, R10 + IMULQ R9, R10 SHRQ $0x34, R10 - MOVL 24(SP)(R9*4), BP - MOVL 24(SP)(R10*4), DI - MOVL CX, 24(SP)(R9*4) - LEAL 1(CX), R9 - MOVL R9, 24(SP)(R10*4) - MOVQ SI, R9 - SHRQ $0x10, R9 - SHLQ $0x18, R9 - IMULQ R8, R9 - SHRQ $0x34, R9 - MOVL CX, R8 - SUBL 16(SP), R8 - MOVL 1(DX)(R8*1), R10 - MOVQ SI, R8 - SHRQ $0x08, R8 - CMPL R8, R10 + SHLQ $0x18, R11 + IMULQ R9, R11 + SHRQ $0x34, R11 + MOVL 24(SP)(R10*4), SI + MOVL 24(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + LEAL 1(CX), R10 + MOVL R10, 24(SP)(R11*4) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x18, R10 + IMULQ R9, R10 + SHRQ $0x34, R10 + MOVL CX, R9 + SUBL 16(SP), R9 + MOVL 1(DX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 JNE no_repeat_found_encodeBlockAsm12B - LEAL 1(CX), SI - MOVL 12(SP), DI - MOVL SI, BP - SUBL 16(SP), BP + LEAL 1(CX), DI + MOVL 12(SP), R8 + MOVL DI, SI + SUBL 16(SP), SI JZ repeat_extend_back_end_encodeBlockAsm12B repeat_extend_back_loop_encodeBlockAsm12B: - CMPL SI, DI + CMPL DI, R8 JLE repeat_extend_back_end_encodeBlockAsm12B - MOVB -1(DX)(BP*1), BL - MOVB -1(DX)(SI*1), R8 - CMPB BL, R8 + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(DI*1), R9 + CMPB BL, R9 JNE repeat_extend_back_end_encodeBlockAsm12B - LEAL -1(SI), SI - DECL BP + LEAL -1(DI), DI + DECL SI JNZ repeat_extend_back_loop_encodeBlockAsm12B repeat_extend_back_end_encodeBlockAsm12B: - MOVL 12(SP), BP - CMPL BP, SI + MOVL 12(SP), SI + CMPL SI, DI JEQ emit_literal_done_repeat_emit_encodeBlockAsm12B - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(BP*1), R9 - SUBL BP, R8 - LEAL -1(R8), BP - CMPL BP, $0x3c + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c JLT one_byte_repeat_emit_encodeBlockAsm12B - CMPL BP, $0x00000100 + CMPL SI, $0x00000100 JLT two_bytes_repeat_emit_encodeBlockAsm12B MOVB $0xf4, (AX) - MOVW BP, 1(AX) + MOVW SI, 1(AX) ADDQ $0x03, AX JMP memmove_long_repeat_emit_encodeBlockAsm12B two_bytes_repeat_emit_encodeBlockAsm12B: MOVB $0xf0, (AX) - MOVB BP, 1(AX) + MOVB SI, 1(AX) ADDQ $0x02, AX - CMPL BP, $0x40 + CMPL SI, $0x40 JL memmove_repeat_emit_encodeBlockAsm12B JMP memmove_long_repeat_emit_encodeBlockAsm12B one_byte_repeat_emit_encodeBlockAsm12B: - SHLB $0x02, BP - MOVB BP, (AX) + SHLB $0x02, SI + MOVB SI, (AX) ADDQ $0x01, AX memmove_repeat_emit_encodeBlockAsm12B: - LEAQ (AX)(R8*1), BP + LEAQ (AX)(R9*1), SI // genMemMoveShort - CMPQ R8, $0x03 - JB emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_1or2 - JE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_3 - CMPQ R8, $0x08 - JB emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_4through7 - CMPQ R8, $0x10 + CMPQ R9, $0x08 + JLE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8 + CMPQ R9, $0x10 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16 - CMPQ R8, $0x20 + CMPQ R9, $0x20 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64 -emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_1or2: - MOVB (R9), R10 - MOVB -1(R9)(R8*1), R9 - MOVB R10, (AX) - MOVB R9, -1(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B - -emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_3: - MOVW (R9), R10 - MOVB 2(R9), R9 - MOVW R10, (AX) - MOVB R9, 2(AX) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B - -emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_4through7: - MOVL (R9), R10 - MOVL -4(R9)(R8*1), R9 - MOVL R10, (AX) - MOVL R9, -4(AX)(R8*1) +emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8: + MOVQ (R10), R11 + MOVQ R11, (AX) JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) + MOVOU X1, -16(AX)(R9*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) memmove_end_copy_repeat_emit_encodeBlockAsm12B: - MOVQ BP, AX + MOVQ SI, AX JMP emit_literal_done_repeat_emit_encodeBlockAsm12B memmove_long_repeat_emit_encodeBlockAsm12B: - LEAQ (AX)(R8*1), BP + LEAQ (AX)(R9*1), SI // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R12 + SHRQ $0x05, R12 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R13 + SUBQ R11, R13 + DECQ R12 JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(R9)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 + LEAQ -32(R10)(R13*1), R11 + LEAQ -32(AX)(R13*1), R14 emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R11 ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 + DECQ R12 JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(R9)(R12*1), X4 - MOVOU -16(R9)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R8, R12 + MOVOU -32(R10)(R13*1), X4 + MOVOU -16(R10)(R13*1), X5 + MOVOA X4, -32(AX)(R13*1) + MOVOA X5, -16(AX)(R13*1) + ADDQ $0x20, R13 + CMPQ R9, R13 JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ BP, AX + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX emit_literal_done_repeat_emit_encodeBlockAsm12B: ADDL $0x05, CX - MOVL CX, BP - SUBL 16(SP), BP - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(BP*1), BP + MOVL CX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R9 + SUBL CX, R9 + LEAQ (DX)(CX*1), R10 + LEAQ (DX)(SI*1), SI // matchLen - XORL R11, R11 - CMPL R8, $0x08 + XORL R12, R12 + CMPL R9, $0x08 JL matchlen_single_repeat_extend_encodeBlockAsm12B matchlen_loopback_repeat_extend_encodeBlockAsm12B: - MOVQ (R9)(R11*1), R10 - XORQ (BP)(R11*1), R10 - TESTQ R10, R10 + MOVQ (R10)(R12*1), R11 + XORQ (SI)(R12*1), R11 + TESTQ R11, R11 JZ matchlen_loop_repeat_extend_encodeBlockAsm12B - BSFQ R10, R10 - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 + BSFQ R11, R11 + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 JMP repeat_extend_forward_end_encodeBlockAsm12B matchlen_loop_repeat_extend_encodeBlockAsm12B: - LEAL -8(R8), R8 - LEAL 8(R11), R11 - CMPL R8, $0x08 + LEAL -8(R9), R9 + LEAL 8(R12), R12 + CMPL R9, $0x08 JGE matchlen_loopback_repeat_extend_encodeBlockAsm12B matchlen_single_repeat_extend_encodeBlockAsm12B: - TESTL R8, R8 + TESTL R9, R9 JZ repeat_extend_forward_end_encodeBlockAsm12B matchlen_single_loopback_repeat_extend_encodeBlockAsm12B: - MOVB (R9)(R11*1), R10 - CMPB (BP)(R11*1), R10 + MOVB (R10)(R12*1), R11 + CMPB (SI)(R12*1), R11 JNE repeat_extend_forward_end_encodeBlockAsm12B - LEAL 1(R11), R11 - DECL R8 + LEAL 1(R12), R12 + DECL R9 JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm12B repeat_extend_forward_end_encodeBlockAsm12B: - ADDL R11, CX - MOVL CX, BP - SUBL SI, BP - MOVL 16(SP), SI - TESTL DI, DI + ADDL R12, CX + MOVL CX, SI + SUBL DI, SI + MOVL 16(SP), DI + TESTL R8, R8 JZ repeat_as_copy_encodeBlockAsm12B // emitRepeat - MOVL BP, DI - LEAL -4(BP), BP - CMPL DI, $0x08 + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 JLE repeat_two_match_repeat_encodeBlockAsm12B - CMPL DI, $0x0c + CMPL R8, $0x0c JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm12B - CMPL SI, $0x00000800 + CMPL DI, $0x00000800 JLT repeat_two_offset_match_repeat_encodeBlockAsm12B cant_repeat_two_offset_match_repeat_encodeBlockAsm12B: - CMPL BP, $0x00000104 + CMPL SI, $0x00000104 JLT repeat_three_match_repeat_encodeBlockAsm12B - LEAL -256(BP), BP + LEAL -256(SI), SI MOVW $0x0019, (AX) - MOVW BP, 2(AX) + MOVW SI, 2(AX) ADDQ $0x04, AX JMP repeat_end_emit_encodeBlockAsm12B repeat_three_match_repeat_encodeBlockAsm12B: - LEAL -4(BP), BP + LEAL -4(SI), SI MOVW $0x0015, (AX) - MOVB BP, 2(AX) + MOVB SI, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsm12B repeat_two_match_repeat_encodeBlockAsm12B: - SHLL $0x02, BP - ORL $0x01, BP - MOVW BP, (AX) + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm12B repeat_two_offset_match_repeat_encodeBlockAsm12B: - XORQ DI, DI - LEAL 1(DI)(BP*4), BP - MOVB SI, 1(AX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, BP - MOVB BP, (AX) + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm12B repeat_as_copy_encodeBlockAsm12B: // emitCopy two_byte_offset_repeat_as_copy_encodeBlockAsm12B: - CMPL BP, $0x40 + CMPL SI, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B MOVB $0xee, (AX) - MOVW SI, 1(AX) - LEAL -60(BP), BP + MOVW DI, 1(AX) + LEAL -60(SI), SI ADDQ $0x03, AX // emitRepeat - MOVL BP, DI - LEAL -4(BP), BP - CMPL DI, $0x08 + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 JLE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short - CMPL DI, $0x0c + CMPL R8, $0x0c JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short - CMPL SI, $0x00000800 + CMPL DI, $0x00000800 JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: - CMPL BP, $0x00000104 + CMPL SI, $0x00000104 JLT repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short - LEAL -256(BP), BP + LEAL -256(SI), SI MOVW $0x0019, (AX) - MOVW BP, 2(AX) + MOVW SI, 2(AX) ADDQ $0x04, AX JMP repeat_end_emit_encodeBlockAsm12B repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: - LEAL -4(BP), BP + LEAL -4(SI), SI MOVW $0x0015, (AX) - MOVB BP, 2(AX) + MOVB SI, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsm12B repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: - SHLL $0x02, BP - ORL $0x01, BP - MOVW BP, (AX) + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm12B repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: - XORQ DI, DI - LEAL 1(DI)(BP*4), BP - MOVB SI, 1(AX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, BP - MOVB BP, (AX) + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm12B JMP two_byte_offset_repeat_as_copy_encodeBlockAsm12B two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B: - CMPL BP, $0x0c + CMPL SI, $0x0c JGE emit_copy_three_repeat_as_copy_encodeBlockAsm12B - CMPL SI, $0x00000800 + CMPL DI, $0x00000800 JGE emit_copy_three_repeat_as_copy_encodeBlockAsm12B MOVB $0x01, BL - LEAL -16(BX)(BP*4), BP - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, BP - MOVB BP, (AX) + LEAL -16(BX)(SI*4), SI + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm12B emit_copy_three_repeat_as_copy_encodeBlockAsm12B: MOVB $0x02, BL - LEAL -4(BX)(BP*4), BP - MOVB BP, (AX) - MOVW SI, 1(AX) + LEAL -4(BX)(SI*4), SI + MOVB SI, (AX) + MOVW DI, 1(AX) ADDQ $0x03, AX repeat_end_emit_encodeBlockAsm12B: @@ -2752,16 +2619,16 @@ repeat_end_emit_encodeBlockAsm12B: JMP search_loop_encodeBlockAsm12B no_repeat_found_encodeBlockAsm12B: - CMPL (DX)(BP*1), SI + CMPL (DX)(SI*1), DI JEQ candidate_match_encodeBlockAsm12B - SHRQ $0x08, SI - MOVL 24(SP)(R9*4), BP - LEAL 2(CX), R8 - CMPL (DX)(DI*1), SI + SHRQ $0x08, DI + MOVL 24(SP)(R10*4), SI + LEAL 2(CX), R9 + CMPL (DX)(R8*1), DI JEQ candidate2_match_encodeBlockAsm12B - MOVL R8, 24(SP)(R9*4) - SHRQ $0x08, SI - CMPL (DX)(BP*1), SI + MOVL R9, 24(SP)(R10*4) + SHRQ $0x08, DI + CMPL (DX)(SI*1), DI JEQ candidate3_match_encodeBlockAsm12B MOVL 20(SP), CX JMP search_loop_encodeBlockAsm12B @@ -2771,329 +2638,310 @@ candidate3_match_encodeBlockAsm12B: JMP candidate_match_encodeBlockAsm12B candidate2_match_encodeBlockAsm12B: - MOVL R8, 24(SP)(R9*4) + MOVL R9, 24(SP)(R10*4) INCL CX - MOVL DI, BP + MOVL R8, SI candidate_match_encodeBlockAsm12B: - MOVL 12(SP), SI - TESTL BP, BP + MOVL 12(SP), DI + TESTL SI, SI JZ match_extend_back_end_encodeBlockAsm12B match_extend_back_loop_encodeBlockAsm12B: - CMPL CX, SI + CMPL CX, DI JLE match_extend_back_end_encodeBlockAsm12B - MOVB -1(DX)(BP*1), BL - MOVB -1(DX)(CX*1), DI - CMPB BL, DI + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 JNE match_extend_back_end_encodeBlockAsm12B LEAL -1(CX), CX - DECL BP + DECL SI JZ match_extend_back_end_encodeBlockAsm12B JMP match_extend_back_loop_encodeBlockAsm12B match_extend_back_end_encodeBlockAsm12B: - MOVL CX, SI - SUBL 12(SP), SI - LEAQ 3(AX)(SI*1), SI - CMPQ SI, (SP) + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 3(AX)(DI*1), DI + CMPQ DI, (SP) JL match_dst_size_check_encodeBlockAsm12B MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeBlockAsm12B: - MOVL CX, SI - MOVL 12(SP), DI - CMPL DI, SI + MOVL CX, DI + MOVL 12(SP), R8 + CMPL R8, DI JEQ emit_literal_done_match_emit_encodeBlockAsm12B - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(DI*1), SI - SUBL DI, R8 - LEAL -1(R8), DI - CMPL DI, $0x3c + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(R8*1), DI + SUBL R8, R9 + LEAL -1(R9), R8 + CMPL R8, $0x3c JLT one_byte_match_emit_encodeBlockAsm12B - CMPL DI, $0x00000100 + CMPL R8, $0x00000100 JLT two_bytes_match_emit_encodeBlockAsm12B MOVB $0xf4, (AX) - MOVW DI, 1(AX) + MOVW R8, 1(AX) ADDQ $0x03, AX JMP memmove_long_match_emit_encodeBlockAsm12B two_bytes_match_emit_encodeBlockAsm12B: MOVB $0xf0, (AX) - MOVB DI, 1(AX) + MOVB R8, 1(AX) ADDQ $0x02, AX - CMPL DI, $0x40 + CMPL R8, $0x40 JL memmove_match_emit_encodeBlockAsm12B JMP memmove_long_match_emit_encodeBlockAsm12B one_byte_match_emit_encodeBlockAsm12B: - SHLB $0x02, DI - MOVB DI, (AX) + SHLB $0x02, R8 + MOVB R8, (AX) ADDQ $0x01, AX memmove_match_emit_encodeBlockAsm12B: - LEAQ (AX)(R8*1), DI + LEAQ (AX)(R9*1), R8 // genMemMoveShort - CMPQ R8, $0x03 - JB emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_1or2 - JE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_3 - CMPQ R8, $0x08 - JB emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_4through7 - CMPQ R8, $0x10 + CMPQ R9, $0x08 + JLE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8 + CMPQ R9, $0x10 JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16 - CMPQ R8, $0x20 + CMPQ R9, $0x20 JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64 -emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_1or2: - MOVB (SI), R9 - MOVB -1(SI)(R8*1), SI - MOVB R9, (AX) - MOVB SI, -1(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm12B - -emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_3: - MOVW (SI), R9 - MOVB 2(SI), SI - MOVW R9, (AX) - MOVB SI, 2(AX) - JMP memmove_end_copy_match_emit_encodeBlockAsm12B - -emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_4through7: - MOVL (SI), R9 - MOVL -4(SI)(R8*1), SI - MOVL R9, (AX) - MOVL SI, -4(AX)(R8*1) +emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8: + MOVQ (DI), R10 + MOVQ R10, (AX) JMP memmove_end_copy_match_emit_encodeBlockAsm12B emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16: - MOVQ (SI), R9 - MOVQ -8(SI)(R8*1), SI - MOVQ R9, (AX) - MOVQ SI, -8(AX)(R8*1) + MOVQ (DI), R10 + MOVQ -8(DI)(R9*1), DI + MOVQ R10, (AX) + MOVQ DI, -8(AX)(R9*1) JMP memmove_end_copy_match_emit_encodeBlockAsm12B emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32: - MOVOU (SI), X0 - MOVOU -16(SI)(R8*1), X1 + MOVOU (DI), X0 + MOVOU -16(DI)(R9*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) + MOVOU X1, -16(AX)(R9*1) JMP memmove_end_copy_match_emit_encodeBlockAsm12B emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64: - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) memmove_end_copy_match_emit_encodeBlockAsm12B: - MOVQ DI, AX + MOVQ R8, AX JMP emit_literal_done_match_emit_encodeBlockAsm12B memmove_long_match_emit_encodeBlockAsm12B: - LEAQ (AX)(R8*1), DI + LEAQ (AX)(R9*1), R8 // genMemMoveLong - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 - MOVQ R8, R10 - SHRQ $0x05, R10 - MOVQ AX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R11 - SUBQ R9, R11 - DECQ R10 + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVQ R9, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 JA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(SI)(R11*1), R9 - LEAQ -32(AX)(R11*1), R12 + LEAQ -32(DI)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back: - MOVOU (R9), X4 - MOVOU 16(R9), X5 - MOVOA X4, (R12) - MOVOA X5, 16(R12) + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 ADDQ $0x20, R12 - ADDQ $0x20, R9 - ADDQ $0x20, R11 - DECQ R10 + DECQ R11 JNA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(SI)(R11*1), X4 - MOVOU -16(SI)(R11*1), X5 - MOVOA X4, -32(AX)(R11*1) - MOVOA X5, -16(AX)(R11*1) - ADDQ $0x20, R11 - CMPQ R8, R11 + MOVOU -32(DI)(R12*1), X4 + MOVOU -16(DI)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R9, R12 JAE emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ DI, AX + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ R8, AX emit_literal_done_match_emit_encodeBlockAsm12B: match_nolit_loop_encodeBlockAsm12B: - MOVL CX, SI - SUBL BP, SI - MOVL SI, 16(SP) + MOVL CX, DI + SUBL SI, DI + MOVL DI, 16(SP) ADDL $0x04, CX - ADDL $0x04, BP - MOVQ src_len+32(FP), SI - SUBL CX, SI - LEAQ (DX)(CX*1), DI - LEAQ (DX)(BP*1), BP + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(SI*1), SI // matchLen - XORL R9, R9 - CMPL SI, $0x08 + XORL R10, R10 + CMPL DI, $0x08 JL matchlen_single_match_nolit_encodeBlockAsm12B matchlen_loopback_match_nolit_encodeBlockAsm12B: - MOVQ (DI)(R9*1), R8 - XORQ (BP)(R9*1), R8 - TESTQ R8, R8 + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + TESTQ R9, R9 JZ matchlen_loop_match_nolit_encodeBlockAsm12B - BSFQ R8, R8 - SARQ $0x03, R8 - LEAL (R9)(R8*1), R9 + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 JMP match_nolit_end_encodeBlockAsm12B matchlen_loop_match_nolit_encodeBlockAsm12B: - LEAL -8(SI), SI - LEAL 8(R9), R9 - CMPL SI, $0x08 + LEAL -8(DI), DI + LEAL 8(R10), R10 + CMPL DI, $0x08 JGE matchlen_loopback_match_nolit_encodeBlockAsm12B matchlen_single_match_nolit_encodeBlockAsm12B: - TESTL SI, SI + TESTL DI, DI JZ match_nolit_end_encodeBlockAsm12B matchlen_single_loopback_match_nolit_encodeBlockAsm12B: - MOVB (DI)(R9*1), R8 - CMPB (BP)(R9*1), R8 + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 JNE match_nolit_end_encodeBlockAsm12B - LEAL 1(R9), R9 - DECL SI + LEAL 1(R10), R10 + DECL DI JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm12B match_nolit_end_encodeBlockAsm12B: - ADDL R9, CX - MOVL 16(SP), BP - ADDL $0x04, R9 + ADDL R10, CX + MOVL 16(SP), SI + ADDL $0x04, R10 MOVL CX, 12(SP) // emitCopy two_byte_offset_match_nolit_encodeBlockAsm12B: - CMPL R9, $0x40 + CMPL R10, $0x40 JLE two_byte_offset_short_match_nolit_encodeBlockAsm12B MOVB $0xee, (AX) - MOVW BP, 1(AX) - LEAL -60(R9), R9 + MOVW SI, 1(AX) + LEAL -60(R10), R10 ADDQ $0x03, AX // emitRepeat - MOVL R9, SI - LEAL -4(R9), R9 - CMPL SI, $0x08 + MOVL R10, DI + LEAL -4(R10), R10 + CMPL DI, $0x08 JLE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short - CMPL SI, $0x0c + CMPL DI, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: - CMPL R9, $0x00000104 + CMPL R10, $0x00000104 JLT repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short - LEAL -256(R9), R9 + LEAL -256(R10), R10 MOVW $0x0019, (AX) - MOVW R9, 2(AX) + MOVW R10, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBlockAsm12B repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short: - LEAL -4(R9), R9 + LEAL -4(R10), R10 MOVW $0x0015, (AX) - MOVB R9, 2(AX) + MOVB R10, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBlockAsm12B repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short: - SHLL $0x02, R9 - ORL $0x01, R9 - MOVW R9, (AX) + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm12B repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: - XORQ SI, SI - LEAL 1(SI)(R9*4), R9 - MOVB BP, 1(AX) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, R9 - MOVB R9, (AX) + XORQ DI, DI + LEAL 1(DI)(R10*4), R10 + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm12B JMP two_byte_offset_match_nolit_encodeBlockAsm12B two_byte_offset_short_match_nolit_encodeBlockAsm12B: - CMPL R9, $0x0c + CMPL R10, $0x0c JGE emit_copy_three_match_nolit_encodeBlockAsm12B - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JGE emit_copy_three_match_nolit_encodeBlockAsm12B MOVB $0x01, BL - LEAL -16(BX)(R9*4), R9 - MOVB BP, 1(AX) - SHRL $0x08, BP - SHLL $0x05, BP - ORL BP, R9 - MOVB R9, (AX) + LEAL -16(BX)(R10*4), R10 + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm12B emit_copy_three_match_nolit_encodeBlockAsm12B: MOVB $0x02, BL - LEAL -4(BX)(R9*4), R9 - MOVB R9, (AX) - MOVW BP, 1(AX) + LEAL -4(BX)(R10*4), R10 + MOVB R10, (AX) + MOVW SI, 1(AX) ADDQ $0x03, AX match_nolit_emitcopy_end_encodeBlockAsm12B: CMPL CX, 8(SP) JGE emit_remainder_encodeBlockAsm12B - MOVQ -2(DX)(CX*1), SI + MOVQ -2(DX)(CX*1), DI CMPQ AX, (SP) JL match_nolit_dst_ok_encodeBlockAsm12B MOVQ $0x00000000, ret+48(FP) RET match_nolit_dst_ok_encodeBlockAsm12B: - MOVQ $0x000000cf1bbcdcbb, R8 - MOVQ SI, DI - SHRQ $0x10, SI - MOVQ SI, BP - SHLQ $0x18, DI - IMULQ R8, DI - SHRQ $0x34, DI - SHLQ $0x18, BP - IMULQ R8, BP - SHRQ $0x34, BP - LEAL -2(CX), R8 - LEAQ 24(SP)(BP*4), R9 - MOVL (R9), BP - MOVL R8, 24(SP)(DI*4) - MOVL CX, (R9) - CMPL (DX)(BP*1), SI + MOVQ $0x000000cf1bbcdcbb, R9 + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, SI + SHLQ $0x18, R8 + IMULQ R9, R8 + SHRQ $0x34, R8 + SHLQ $0x18, SI + IMULQ R9, SI + SHRQ $0x34, SI + LEAL -2(CX), R9 + LEAQ 24(SP)(SI*4), R10 + MOVL (R10), SI + MOVL R9, 24(SP)(R8*4) + MOVL CX, (R10) + CMPL (DX)(SI*1), DI JEQ match_nolit_loop_encodeBlockAsm12B INCL CX JMP search_loop_encodeBlockAsm12B @@ -3112,11 +2960,11 @@ emit_remainder_ok_encodeBlockAsm12B: MOVL 12(SP), BX CMPL BX, CX JEQ emit_literal_done_emit_remainder_encodeBlockAsm12B - MOVL CX, BP + MOVL CX, SI MOVL CX, 12(SP) LEAQ (DX)(BX*1), CX - SUBL BX, BP - LEAL -1(BP), DX + SUBL BX, SI + LEAL -1(SI), DX CMPL DX, $0x3c JLT one_byte_emit_remainder_encodeBlockAsm12B CMPL DX, $0x00000100 @@ -3140,46 +2988,27 @@ one_byte_emit_remainder_encodeBlockAsm12B: ADDQ $0x01, AX memmove_emit_remainder_encodeBlockAsm12B: - LEAQ (AX)(BP*1), DX - MOVL BP, BX + LEAQ (AX)(SI*1), DX + MOVL SI, BX // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3 CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7 + JLE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2: - MOVB (CX), BP - MOVB -1(CX)(BX*1), CL - MOVB BP, (AX) - MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3: - MOVW (CX), BP - MOVB 2(CX), CL - MOVW BP, (AX) - MOVB CL, 2(AX) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7: - MOVL (CX), BP - MOVL -4(CX)(BX*1), CX - MOVL BP, (AX) - MOVL CX, -4(AX)(BX*1) +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8: + MOVQ (CX), SI + MOVQ SI, (AX) JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16: - MOVQ (CX), BP + MOVQ (CX), SI MOVQ -8(CX)(BX*1), CX - MOVQ BP, (AX) + MOVQ SI, (AX) MOVQ CX, -8(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B @@ -3205,43 +3034,43 @@ memmove_end_copy_emit_remainder_encodeBlockAsm12B: JMP emit_literal_done_emit_remainder_encodeBlockAsm12B memmove_long_emit_remainder_encodeBlockAsm12B: - LEAQ (AX)(BP*1), DX - MOVL BP, BX + LEAQ (AX)(SI*1), DX + MOVL SI, BX // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 - MOVQ BX, SI - SHRQ $0x05, SI - MOVQ AX, BP - ANDL $0x0000001f, BP - MOVQ $0x00000040, DI - SUBQ BP, DI - DECQ SI + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(CX)(DI*1), BP - LEAQ -32(AX)(DI*1), R8 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back: - MOVOU (BP), X4 - MOVOU 16(BP), X5 - MOVOA X4, (R8) - MOVOA X5, 16(R8) + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI ADDQ $0x20, R8 - ADDQ $0x20, BP - ADDQ $0x20, DI - DECQ SI + DECQ DI JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(CX)(DI*1), X4 - MOVOU -16(CX)(DI*1), X5 - MOVOA X4, -32(AX)(DI*1) - MOVOA X5, -16(AX)(DI*1) - ADDQ $0x20, DI - CMPQ BX, DI + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) @@ -3277,9 +3106,9 @@ zero_loop_encodeBlockAsm10B: JNZ zero_loop_encodeBlockAsm10B MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX - LEAQ -5(CX), DX - LEAQ -8(CX), BP - MOVL BP, 8(SP) + LEAQ -9(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) SHRQ $0x05, CX SUBL CX, DX LEAQ (AX)(DX*1), DX @@ -3289,366 +3118,347 @@ zero_loop_encodeBlockAsm10B: MOVQ src_base+24(FP), DX search_loop_encodeBlockAsm10B: - MOVQ (DX)(CX*1), SI - MOVL CX, BP - SUBL 12(SP), BP - SHRL $0x05, BP - LEAL 4(CX)(BP*1), BP - CMPL BP, 8(SP) + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x05, SI + LEAL 4(CX)(SI*1), SI + CMPL SI, 8(SP) JGE emit_remainder_encodeBlockAsm10B - MOVL BP, 20(SP) - MOVQ $0x9e3779b1, R8 - MOVQ SI, R9 - MOVQ SI, R10 - SHRQ $0x08, R10 - SHLQ $0x20, R9 - IMULQ R8, R9 - SHRQ $0x36, R9 + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x9e3779b1, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 SHLQ $0x20, R10 - IMULQ R8, R10 + IMULQ R9, R10 SHRQ $0x36, R10 - MOVL 24(SP)(R9*4), BP - MOVL 24(SP)(R10*4), DI - MOVL CX, 24(SP)(R9*4) - LEAL 1(CX), R9 - MOVL R9, 24(SP)(R10*4) - MOVQ SI, R9 - SHRQ $0x10, R9 - SHLQ $0x20, R9 - IMULQ R8, R9 - SHRQ $0x36, R9 - MOVL CX, R8 - SUBL 16(SP), R8 - MOVL 1(DX)(R8*1), R10 - MOVQ SI, R8 - SHRQ $0x08, R8 - CMPL R8, R10 + SHLQ $0x20, R11 + IMULQ R9, R11 + SHRQ $0x36, R11 + MOVL 24(SP)(R10*4), SI + MOVL 24(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + LEAL 1(CX), R10 + MOVL R10, 24(SP)(R11*4) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x20, R10 + IMULQ R9, R10 + SHRQ $0x36, R10 + MOVL CX, R9 + SUBL 16(SP), R9 + MOVL 1(DX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 JNE no_repeat_found_encodeBlockAsm10B - LEAL 1(CX), SI - MOVL 12(SP), DI - MOVL SI, BP - SUBL 16(SP), BP + LEAL 1(CX), DI + MOVL 12(SP), R8 + MOVL DI, SI + SUBL 16(SP), SI JZ repeat_extend_back_end_encodeBlockAsm10B repeat_extend_back_loop_encodeBlockAsm10B: - CMPL SI, DI + CMPL DI, R8 JLE repeat_extend_back_end_encodeBlockAsm10B - MOVB -1(DX)(BP*1), BL - MOVB -1(DX)(SI*1), R8 - CMPB BL, R8 + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(DI*1), R9 + CMPB BL, R9 JNE repeat_extend_back_end_encodeBlockAsm10B - LEAL -1(SI), SI - DECL BP + LEAL -1(DI), DI + DECL SI JNZ repeat_extend_back_loop_encodeBlockAsm10B repeat_extend_back_end_encodeBlockAsm10B: - MOVL 12(SP), BP - CMPL BP, SI + MOVL 12(SP), SI + CMPL SI, DI JEQ emit_literal_done_repeat_emit_encodeBlockAsm10B - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(BP*1), R9 - SUBL BP, R8 - LEAL -1(R8), BP - CMPL BP, $0x3c + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c JLT one_byte_repeat_emit_encodeBlockAsm10B - CMPL BP, $0x00000100 + CMPL SI, $0x00000100 JLT two_bytes_repeat_emit_encodeBlockAsm10B MOVB $0xf4, (AX) - MOVW BP, 1(AX) + MOVW SI, 1(AX) ADDQ $0x03, AX JMP memmove_long_repeat_emit_encodeBlockAsm10B two_bytes_repeat_emit_encodeBlockAsm10B: MOVB $0xf0, (AX) - MOVB BP, 1(AX) + MOVB SI, 1(AX) ADDQ $0x02, AX - CMPL BP, $0x40 + CMPL SI, $0x40 JL memmove_repeat_emit_encodeBlockAsm10B JMP memmove_long_repeat_emit_encodeBlockAsm10B one_byte_repeat_emit_encodeBlockAsm10B: - SHLB $0x02, BP - MOVB BP, (AX) + SHLB $0x02, SI + MOVB SI, (AX) ADDQ $0x01, AX memmove_repeat_emit_encodeBlockAsm10B: - LEAQ (AX)(R8*1), BP + LEAQ (AX)(R9*1), SI // genMemMoveShort - CMPQ R8, $0x03 - JB emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_1or2 - JE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_3 - CMPQ R8, $0x08 - JB emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_4through7 - CMPQ R8, $0x10 + CMPQ R9, $0x08 + JLE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8 + CMPQ R9, $0x10 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16 - CMPQ R8, $0x20 + CMPQ R9, $0x20 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64 -emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_1or2: - MOVB (R9), R10 - MOVB -1(R9)(R8*1), R9 - MOVB R10, (AX) - MOVB R9, -1(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B - -emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_3: - MOVW (R9), R10 - MOVB 2(R9), R9 - MOVW R10, (AX) - MOVB R9, 2(AX) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B - -emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_4through7: - MOVL (R9), R10 - MOVL -4(R9)(R8*1), R9 - MOVL R10, (AX) - MOVL R9, -4(AX)(R8*1) +emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8: + MOVQ (R10), R11 + MOVQ R11, (AX) JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) + MOVOU X1, -16(AX)(R9*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) memmove_end_copy_repeat_emit_encodeBlockAsm10B: - MOVQ BP, AX + MOVQ SI, AX JMP emit_literal_done_repeat_emit_encodeBlockAsm10B memmove_long_repeat_emit_encodeBlockAsm10B: - LEAQ (AX)(R8*1), BP + LEAQ (AX)(R9*1), SI // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R12 + SHRQ $0x05, R12 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R13 + SUBQ R11, R13 + DECQ R12 JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(R9)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 + LEAQ -32(R10)(R13*1), R11 + LEAQ -32(AX)(R13*1), R14 emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R11 ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 + DECQ R12 JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(R9)(R12*1), X4 - MOVOU -16(R9)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R8, R12 + MOVOU -32(R10)(R13*1), X4 + MOVOU -16(R10)(R13*1), X5 + MOVOA X4, -32(AX)(R13*1) + MOVOA X5, -16(AX)(R13*1) + ADDQ $0x20, R13 + CMPQ R9, R13 JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ BP, AX + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX emit_literal_done_repeat_emit_encodeBlockAsm10B: ADDL $0x05, CX - MOVL CX, BP - SUBL 16(SP), BP - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(BP*1), BP + MOVL CX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R9 + SUBL CX, R9 + LEAQ (DX)(CX*1), R10 + LEAQ (DX)(SI*1), SI // matchLen - XORL R11, R11 - CMPL R8, $0x08 + XORL R12, R12 + CMPL R9, $0x08 JL matchlen_single_repeat_extend_encodeBlockAsm10B matchlen_loopback_repeat_extend_encodeBlockAsm10B: - MOVQ (R9)(R11*1), R10 - XORQ (BP)(R11*1), R10 - TESTQ R10, R10 + MOVQ (R10)(R12*1), R11 + XORQ (SI)(R12*1), R11 + TESTQ R11, R11 JZ matchlen_loop_repeat_extend_encodeBlockAsm10B - BSFQ R10, R10 - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 + BSFQ R11, R11 + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 JMP repeat_extend_forward_end_encodeBlockAsm10B matchlen_loop_repeat_extend_encodeBlockAsm10B: - LEAL -8(R8), R8 - LEAL 8(R11), R11 - CMPL R8, $0x08 + LEAL -8(R9), R9 + LEAL 8(R12), R12 + CMPL R9, $0x08 JGE matchlen_loopback_repeat_extend_encodeBlockAsm10B matchlen_single_repeat_extend_encodeBlockAsm10B: - TESTL R8, R8 + TESTL R9, R9 JZ repeat_extend_forward_end_encodeBlockAsm10B matchlen_single_loopback_repeat_extend_encodeBlockAsm10B: - MOVB (R9)(R11*1), R10 - CMPB (BP)(R11*1), R10 + MOVB (R10)(R12*1), R11 + CMPB (SI)(R12*1), R11 JNE repeat_extend_forward_end_encodeBlockAsm10B - LEAL 1(R11), R11 - DECL R8 + LEAL 1(R12), R12 + DECL R9 JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm10B repeat_extend_forward_end_encodeBlockAsm10B: - ADDL R11, CX - MOVL CX, BP - SUBL SI, BP - MOVL 16(SP), SI - TESTL DI, DI + ADDL R12, CX + MOVL CX, SI + SUBL DI, SI + MOVL 16(SP), DI + TESTL R8, R8 JZ repeat_as_copy_encodeBlockAsm10B // emitRepeat - MOVL BP, DI - LEAL -4(BP), BP - CMPL DI, $0x08 + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 JLE repeat_two_match_repeat_encodeBlockAsm10B - CMPL DI, $0x0c + CMPL R8, $0x0c JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm10B - CMPL SI, $0x00000800 + CMPL DI, $0x00000800 JLT repeat_two_offset_match_repeat_encodeBlockAsm10B cant_repeat_two_offset_match_repeat_encodeBlockAsm10B: - CMPL BP, $0x00000104 + CMPL SI, $0x00000104 JLT repeat_three_match_repeat_encodeBlockAsm10B - LEAL -256(BP), BP + LEAL -256(SI), SI MOVW $0x0019, (AX) - MOVW BP, 2(AX) + MOVW SI, 2(AX) ADDQ $0x04, AX JMP repeat_end_emit_encodeBlockAsm10B repeat_three_match_repeat_encodeBlockAsm10B: - LEAL -4(BP), BP + LEAL -4(SI), SI MOVW $0x0015, (AX) - MOVB BP, 2(AX) + MOVB SI, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsm10B repeat_two_match_repeat_encodeBlockAsm10B: - SHLL $0x02, BP - ORL $0x01, BP - MOVW BP, (AX) + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm10B repeat_two_offset_match_repeat_encodeBlockAsm10B: - XORQ DI, DI - LEAL 1(DI)(BP*4), BP - MOVB SI, 1(AX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, BP - MOVB BP, (AX) + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm10B repeat_as_copy_encodeBlockAsm10B: // emitCopy two_byte_offset_repeat_as_copy_encodeBlockAsm10B: - CMPL BP, $0x40 + CMPL SI, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B MOVB $0xee, (AX) - MOVW SI, 1(AX) - LEAL -60(BP), BP + MOVW DI, 1(AX) + LEAL -60(SI), SI ADDQ $0x03, AX // emitRepeat - MOVL BP, DI - LEAL -4(BP), BP - CMPL DI, $0x08 + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 JLE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short - CMPL DI, $0x0c + CMPL R8, $0x0c JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short - CMPL SI, $0x00000800 + CMPL DI, $0x00000800 JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: - CMPL BP, $0x00000104 + CMPL SI, $0x00000104 JLT repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short - LEAL -256(BP), BP + LEAL -256(SI), SI MOVW $0x0019, (AX) - MOVW BP, 2(AX) + MOVW SI, 2(AX) ADDQ $0x04, AX JMP repeat_end_emit_encodeBlockAsm10B repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: - LEAL -4(BP), BP + LEAL -4(SI), SI MOVW $0x0015, (AX) - MOVB BP, 2(AX) + MOVB SI, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsm10B repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: - SHLL $0x02, BP - ORL $0x01, BP - MOVW BP, (AX) + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm10B repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: - XORQ DI, DI - LEAL 1(DI)(BP*4), BP - MOVB SI, 1(AX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, BP - MOVB BP, (AX) + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm10B JMP two_byte_offset_repeat_as_copy_encodeBlockAsm10B two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B: - CMPL BP, $0x0c + CMPL SI, $0x0c JGE emit_copy_three_repeat_as_copy_encodeBlockAsm10B - CMPL SI, $0x00000800 + CMPL DI, $0x00000800 JGE emit_copy_three_repeat_as_copy_encodeBlockAsm10B MOVB $0x01, BL - LEAL -16(BX)(BP*4), BP - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, BP - MOVB BP, (AX) + LEAL -16(BX)(SI*4), SI + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm10B emit_copy_three_repeat_as_copy_encodeBlockAsm10B: MOVB $0x02, BL - LEAL -4(BX)(BP*4), BP - MOVB BP, (AX) - MOVW SI, 1(AX) + LEAL -4(BX)(SI*4), SI + MOVB SI, (AX) + MOVW DI, 1(AX) ADDQ $0x03, AX repeat_end_emit_encodeBlockAsm10B: @@ -3656,16 +3466,16 @@ repeat_end_emit_encodeBlockAsm10B: JMP search_loop_encodeBlockAsm10B no_repeat_found_encodeBlockAsm10B: - CMPL (DX)(BP*1), SI + CMPL (DX)(SI*1), DI JEQ candidate_match_encodeBlockAsm10B - SHRQ $0x08, SI - MOVL 24(SP)(R9*4), BP - LEAL 2(CX), R8 - CMPL (DX)(DI*1), SI + SHRQ $0x08, DI + MOVL 24(SP)(R10*4), SI + LEAL 2(CX), R9 + CMPL (DX)(R8*1), DI JEQ candidate2_match_encodeBlockAsm10B - MOVL R8, 24(SP)(R9*4) - SHRQ $0x08, SI - CMPL (DX)(BP*1), SI + MOVL R9, 24(SP)(R10*4) + SHRQ $0x08, DI + CMPL (DX)(SI*1), DI JEQ candidate3_match_encodeBlockAsm10B MOVL 20(SP), CX JMP search_loop_encodeBlockAsm10B @@ -3675,329 +3485,310 @@ candidate3_match_encodeBlockAsm10B: JMP candidate_match_encodeBlockAsm10B candidate2_match_encodeBlockAsm10B: - MOVL R8, 24(SP)(R9*4) + MOVL R9, 24(SP)(R10*4) INCL CX - MOVL DI, BP + MOVL R8, SI candidate_match_encodeBlockAsm10B: - MOVL 12(SP), SI - TESTL BP, BP + MOVL 12(SP), DI + TESTL SI, SI JZ match_extend_back_end_encodeBlockAsm10B match_extend_back_loop_encodeBlockAsm10B: - CMPL CX, SI + CMPL CX, DI JLE match_extend_back_end_encodeBlockAsm10B - MOVB -1(DX)(BP*1), BL - MOVB -1(DX)(CX*1), DI - CMPB BL, DI + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 JNE match_extend_back_end_encodeBlockAsm10B LEAL -1(CX), CX - DECL BP + DECL SI JZ match_extend_back_end_encodeBlockAsm10B JMP match_extend_back_loop_encodeBlockAsm10B match_extend_back_end_encodeBlockAsm10B: - MOVL CX, SI - SUBL 12(SP), SI - LEAQ 3(AX)(SI*1), SI - CMPQ SI, (SP) + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 3(AX)(DI*1), DI + CMPQ DI, (SP) JL match_dst_size_check_encodeBlockAsm10B MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeBlockAsm10B: - MOVL CX, SI - MOVL 12(SP), DI - CMPL DI, SI + MOVL CX, DI + MOVL 12(SP), R8 + CMPL R8, DI JEQ emit_literal_done_match_emit_encodeBlockAsm10B - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(DI*1), SI - SUBL DI, R8 - LEAL -1(R8), DI - CMPL DI, $0x3c + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(R8*1), DI + SUBL R8, R9 + LEAL -1(R9), R8 + CMPL R8, $0x3c JLT one_byte_match_emit_encodeBlockAsm10B - CMPL DI, $0x00000100 + CMPL R8, $0x00000100 JLT two_bytes_match_emit_encodeBlockAsm10B MOVB $0xf4, (AX) - MOVW DI, 1(AX) + MOVW R8, 1(AX) ADDQ $0x03, AX JMP memmove_long_match_emit_encodeBlockAsm10B two_bytes_match_emit_encodeBlockAsm10B: MOVB $0xf0, (AX) - MOVB DI, 1(AX) + MOVB R8, 1(AX) ADDQ $0x02, AX - CMPL DI, $0x40 + CMPL R8, $0x40 JL memmove_match_emit_encodeBlockAsm10B JMP memmove_long_match_emit_encodeBlockAsm10B one_byte_match_emit_encodeBlockAsm10B: - SHLB $0x02, DI - MOVB DI, (AX) + SHLB $0x02, R8 + MOVB R8, (AX) ADDQ $0x01, AX memmove_match_emit_encodeBlockAsm10B: - LEAQ (AX)(R8*1), DI + LEAQ (AX)(R9*1), R8 // genMemMoveShort - CMPQ R8, $0x03 - JB emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_1or2 - JE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_3 - CMPQ R8, $0x08 - JB emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_4through7 - CMPQ R8, $0x10 + CMPQ R9, $0x08 + JLE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8 + CMPQ R9, $0x10 JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16 - CMPQ R8, $0x20 + CMPQ R9, $0x20 JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64 -emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_1or2: - MOVB (SI), R9 - MOVB -1(SI)(R8*1), SI - MOVB R9, (AX) - MOVB SI, -1(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm10B - -emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_3: - MOVW (SI), R9 - MOVB 2(SI), SI - MOVW R9, (AX) - MOVB SI, 2(AX) - JMP memmove_end_copy_match_emit_encodeBlockAsm10B - -emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_4through7: - MOVL (SI), R9 - MOVL -4(SI)(R8*1), SI - MOVL R9, (AX) - MOVL SI, -4(AX)(R8*1) +emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8: + MOVQ (DI), R10 + MOVQ R10, (AX) JMP memmove_end_copy_match_emit_encodeBlockAsm10B emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16: - MOVQ (SI), R9 - MOVQ -8(SI)(R8*1), SI - MOVQ R9, (AX) - MOVQ SI, -8(AX)(R8*1) + MOVQ (DI), R10 + MOVQ -8(DI)(R9*1), DI + MOVQ R10, (AX) + MOVQ DI, -8(AX)(R9*1) JMP memmove_end_copy_match_emit_encodeBlockAsm10B emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32: - MOVOU (SI), X0 - MOVOU -16(SI)(R8*1), X1 + MOVOU (DI), X0 + MOVOU -16(DI)(R9*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) + MOVOU X1, -16(AX)(R9*1) JMP memmove_end_copy_match_emit_encodeBlockAsm10B emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64: - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) memmove_end_copy_match_emit_encodeBlockAsm10B: - MOVQ DI, AX + MOVQ R8, AX JMP emit_literal_done_match_emit_encodeBlockAsm10B memmove_long_match_emit_encodeBlockAsm10B: - LEAQ (AX)(R8*1), DI + LEAQ (AX)(R9*1), R8 // genMemMoveLong - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 - MOVQ R8, R10 - SHRQ $0x05, R10 - MOVQ AX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R11 - SUBQ R9, R11 - DECQ R10 + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVQ R9, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 JA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(SI)(R11*1), R9 - LEAQ -32(AX)(R11*1), R12 + LEAQ -32(DI)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back: - MOVOU (R9), X4 - MOVOU 16(R9), X5 - MOVOA X4, (R12) - MOVOA X5, 16(R12) + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 ADDQ $0x20, R12 - ADDQ $0x20, R9 - ADDQ $0x20, R11 - DECQ R10 + DECQ R11 JNA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(SI)(R11*1), X4 - MOVOU -16(SI)(R11*1), X5 - MOVOA X4, -32(AX)(R11*1) - MOVOA X5, -16(AX)(R11*1) - ADDQ $0x20, R11 - CMPQ R8, R11 + MOVOU -32(DI)(R12*1), X4 + MOVOU -16(DI)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R9, R12 JAE emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ DI, AX + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ R8, AX emit_literal_done_match_emit_encodeBlockAsm10B: match_nolit_loop_encodeBlockAsm10B: - MOVL CX, SI - SUBL BP, SI - MOVL SI, 16(SP) + MOVL CX, DI + SUBL SI, DI + MOVL DI, 16(SP) ADDL $0x04, CX - ADDL $0x04, BP - MOVQ src_len+32(FP), SI - SUBL CX, SI - LEAQ (DX)(CX*1), DI - LEAQ (DX)(BP*1), BP + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(SI*1), SI // matchLen - XORL R9, R9 - CMPL SI, $0x08 + XORL R10, R10 + CMPL DI, $0x08 JL matchlen_single_match_nolit_encodeBlockAsm10B matchlen_loopback_match_nolit_encodeBlockAsm10B: - MOVQ (DI)(R9*1), R8 - XORQ (BP)(R9*1), R8 - TESTQ R8, R8 + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + TESTQ R9, R9 JZ matchlen_loop_match_nolit_encodeBlockAsm10B - BSFQ R8, R8 - SARQ $0x03, R8 - LEAL (R9)(R8*1), R9 + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 JMP match_nolit_end_encodeBlockAsm10B matchlen_loop_match_nolit_encodeBlockAsm10B: - LEAL -8(SI), SI - LEAL 8(R9), R9 - CMPL SI, $0x08 + LEAL -8(DI), DI + LEAL 8(R10), R10 + CMPL DI, $0x08 JGE matchlen_loopback_match_nolit_encodeBlockAsm10B matchlen_single_match_nolit_encodeBlockAsm10B: - TESTL SI, SI + TESTL DI, DI JZ match_nolit_end_encodeBlockAsm10B matchlen_single_loopback_match_nolit_encodeBlockAsm10B: - MOVB (DI)(R9*1), R8 - CMPB (BP)(R9*1), R8 + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 JNE match_nolit_end_encodeBlockAsm10B - LEAL 1(R9), R9 - DECL SI + LEAL 1(R10), R10 + DECL DI JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm10B match_nolit_end_encodeBlockAsm10B: - ADDL R9, CX - MOVL 16(SP), BP - ADDL $0x04, R9 + ADDL R10, CX + MOVL 16(SP), SI + ADDL $0x04, R10 MOVL CX, 12(SP) // emitCopy two_byte_offset_match_nolit_encodeBlockAsm10B: - CMPL R9, $0x40 + CMPL R10, $0x40 JLE two_byte_offset_short_match_nolit_encodeBlockAsm10B MOVB $0xee, (AX) - MOVW BP, 1(AX) - LEAL -60(R9), R9 + MOVW SI, 1(AX) + LEAL -60(R10), R10 ADDQ $0x03, AX // emitRepeat - MOVL R9, SI - LEAL -4(R9), R9 - CMPL SI, $0x08 + MOVL R10, DI + LEAL -4(R10), R10 + CMPL DI, $0x08 JLE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short - CMPL SI, $0x0c + CMPL DI, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short: - CMPL R9, $0x00000104 + CMPL R10, $0x00000104 JLT repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short - LEAL -256(R9), R9 + LEAL -256(R10), R10 MOVW $0x0019, (AX) - MOVW R9, 2(AX) + MOVW R10, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBlockAsm10B repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short: - LEAL -4(R9), R9 + LEAL -4(R10), R10 MOVW $0x0015, (AX) - MOVB R9, 2(AX) + MOVB R10, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBlockAsm10B repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short: - SHLL $0x02, R9 - ORL $0x01, R9 - MOVW R9, (AX) + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm10B repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short: - XORQ SI, SI - LEAL 1(SI)(R9*4), R9 - MOVB BP, 1(AX) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, R9 - MOVB R9, (AX) + XORQ DI, DI + LEAL 1(DI)(R10*4), R10 + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm10B JMP two_byte_offset_match_nolit_encodeBlockAsm10B two_byte_offset_short_match_nolit_encodeBlockAsm10B: - CMPL R9, $0x0c + CMPL R10, $0x0c JGE emit_copy_three_match_nolit_encodeBlockAsm10B - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JGE emit_copy_three_match_nolit_encodeBlockAsm10B MOVB $0x01, BL - LEAL -16(BX)(R9*4), R9 - MOVB BP, 1(AX) - SHRL $0x08, BP - SHLL $0x05, BP - ORL BP, R9 - MOVB R9, (AX) + LEAL -16(BX)(R10*4), R10 + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm10B emit_copy_three_match_nolit_encodeBlockAsm10B: MOVB $0x02, BL - LEAL -4(BX)(R9*4), R9 - MOVB R9, (AX) - MOVW BP, 1(AX) + LEAL -4(BX)(R10*4), R10 + MOVB R10, (AX) + MOVW SI, 1(AX) ADDQ $0x03, AX match_nolit_emitcopy_end_encodeBlockAsm10B: CMPL CX, 8(SP) JGE emit_remainder_encodeBlockAsm10B - MOVQ -2(DX)(CX*1), SI + MOVQ -2(DX)(CX*1), DI CMPQ AX, (SP) JL match_nolit_dst_ok_encodeBlockAsm10B MOVQ $0x00000000, ret+48(FP) RET match_nolit_dst_ok_encodeBlockAsm10B: - MOVQ $0x9e3779b1, R8 - MOVQ SI, DI - SHRQ $0x10, SI - MOVQ SI, BP - SHLQ $0x20, DI - IMULQ R8, DI - SHRQ $0x36, DI - SHLQ $0x20, BP - IMULQ R8, BP - SHRQ $0x36, BP - LEAL -2(CX), R8 - LEAQ 24(SP)(BP*4), R9 - MOVL (R9), BP - MOVL R8, 24(SP)(DI*4) - MOVL CX, (R9) - CMPL (DX)(BP*1), SI + MOVQ $0x9e3779b1, R9 + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, SI + SHLQ $0x20, R8 + IMULQ R9, R8 + SHRQ $0x36, R8 + SHLQ $0x20, SI + IMULQ R9, SI + SHRQ $0x36, SI + LEAL -2(CX), R9 + LEAQ 24(SP)(SI*4), R10 + MOVL (R10), SI + MOVL R9, 24(SP)(R8*4) + MOVL CX, (R10) + CMPL (DX)(SI*1), DI JEQ match_nolit_loop_encodeBlockAsm10B INCL CX JMP search_loop_encodeBlockAsm10B @@ -4016,11 +3807,11 @@ emit_remainder_ok_encodeBlockAsm10B: MOVL 12(SP), BX CMPL BX, CX JEQ emit_literal_done_emit_remainder_encodeBlockAsm10B - MOVL CX, BP + MOVL CX, SI MOVL CX, 12(SP) LEAQ (DX)(BX*1), CX - SUBL BX, BP - LEAL -1(BP), DX + SUBL BX, SI + LEAL -1(SI), DX CMPL DX, $0x3c JLT one_byte_emit_remainder_encodeBlockAsm10B CMPL DX, $0x00000100 @@ -4044,46 +3835,27 @@ one_byte_emit_remainder_encodeBlockAsm10B: ADDQ $0x01, AX memmove_emit_remainder_encodeBlockAsm10B: - LEAQ (AX)(BP*1), DX - MOVL BP, BX + LEAQ (AX)(SI*1), DX + MOVL SI, BX // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3 CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7 + JLE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2: - MOVB (CX), BP - MOVB -1(CX)(BX*1), CL - MOVB BP, (AX) - MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3: - MOVW (CX), BP - MOVB 2(CX), CL - MOVW BP, (AX) - MOVB CL, 2(AX) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7: - MOVL (CX), BP - MOVL -4(CX)(BX*1), CX - MOVL BP, (AX) - MOVL CX, -4(AX)(BX*1) +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8: + MOVQ (CX), SI + MOVQ SI, (AX) JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16: - MOVQ (CX), BP + MOVQ (CX), SI MOVQ -8(CX)(BX*1), CX - MOVQ BP, (AX) + MOVQ SI, (AX) MOVQ CX, -8(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B @@ -4109,43 +3881,43 @@ memmove_end_copy_emit_remainder_encodeBlockAsm10B: JMP emit_literal_done_emit_remainder_encodeBlockAsm10B memmove_long_emit_remainder_encodeBlockAsm10B: - LEAQ (AX)(BP*1), DX - MOVL BP, BX + LEAQ (AX)(SI*1), DX + MOVL SI, BX // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 - MOVQ BX, SI - SHRQ $0x05, SI - MOVQ AX, BP - ANDL $0x0000001f, BP - MOVQ $0x00000040, DI - SUBQ BP, DI - DECQ SI + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(CX)(DI*1), BP - LEAQ -32(AX)(DI*1), R8 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back: - MOVOU (BP), X4 - MOVOU 16(BP), X5 - MOVOA X4, (R8) - MOVOA X5, 16(R8) + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI ADDQ $0x20, R8 - ADDQ $0x20, BP - ADDQ $0x20, DI - DECQ SI + DECQ DI JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(CX)(DI*1), X4 - MOVOU -16(CX)(DI*1), X5 - MOVOA X4, -32(AX)(DI*1) - MOVOA X5, -16(AX)(DI*1) - ADDQ $0x20, DI - CMPQ BX, DI + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) @@ -4181,9 +3953,9 @@ zero_loop_encodeBlockAsm8B: JNZ zero_loop_encodeBlockAsm8B MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX - LEAQ -5(CX), DX - LEAQ -8(CX), BP - MOVL BP, 8(SP) + LEAQ -9(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) SHRQ $0x05, CX SUBL CX, DX LEAQ (AX)(DX*1), DX @@ -4193,356 +3965,337 @@ zero_loop_encodeBlockAsm8B: MOVQ src_base+24(FP), DX search_loop_encodeBlockAsm8B: - MOVQ (DX)(CX*1), SI - MOVL CX, BP - SUBL 12(SP), BP - SHRL $0x04, BP - LEAL 4(CX)(BP*1), BP - CMPL BP, 8(SP) + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x04, SI + LEAL 4(CX)(SI*1), SI + CMPL SI, 8(SP) JGE emit_remainder_encodeBlockAsm8B - MOVL BP, 20(SP) - MOVQ $0x9e3779b1, R8 - MOVQ SI, R9 - MOVQ SI, R10 - SHRQ $0x08, R10 - SHLQ $0x20, R9 - IMULQ R8, R9 - SHRQ $0x38, R9 + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x9e3779b1, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 SHLQ $0x20, R10 - IMULQ R8, R10 + IMULQ R9, R10 SHRQ $0x38, R10 - MOVL 24(SP)(R9*4), BP - MOVL 24(SP)(R10*4), DI - MOVL CX, 24(SP)(R9*4) - LEAL 1(CX), R9 - MOVL R9, 24(SP)(R10*4) - MOVQ SI, R9 - SHRQ $0x10, R9 - SHLQ $0x20, R9 - IMULQ R8, R9 - SHRQ $0x38, R9 - MOVL CX, R8 - SUBL 16(SP), R8 - MOVL 1(DX)(R8*1), R10 - MOVQ SI, R8 - SHRQ $0x08, R8 - CMPL R8, R10 + SHLQ $0x20, R11 + IMULQ R9, R11 + SHRQ $0x38, R11 + MOVL 24(SP)(R10*4), SI + MOVL 24(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + LEAL 1(CX), R10 + MOVL R10, 24(SP)(R11*4) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x20, R10 + IMULQ R9, R10 + SHRQ $0x38, R10 + MOVL CX, R9 + SUBL 16(SP), R9 + MOVL 1(DX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 JNE no_repeat_found_encodeBlockAsm8B - LEAL 1(CX), SI - MOVL 12(SP), DI - MOVL SI, BP - SUBL 16(SP), BP + LEAL 1(CX), DI + MOVL 12(SP), R8 + MOVL DI, SI + SUBL 16(SP), SI JZ repeat_extend_back_end_encodeBlockAsm8B repeat_extend_back_loop_encodeBlockAsm8B: - CMPL SI, DI + CMPL DI, R8 JLE repeat_extend_back_end_encodeBlockAsm8B - MOVB -1(DX)(BP*1), BL - MOVB -1(DX)(SI*1), R8 - CMPB BL, R8 + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(DI*1), R9 + CMPB BL, R9 JNE repeat_extend_back_end_encodeBlockAsm8B - LEAL -1(SI), SI - DECL BP + LEAL -1(DI), DI + DECL SI JNZ repeat_extend_back_loop_encodeBlockAsm8B repeat_extend_back_end_encodeBlockAsm8B: - MOVL 12(SP), BP - CMPL BP, SI + MOVL 12(SP), SI + CMPL SI, DI JEQ emit_literal_done_repeat_emit_encodeBlockAsm8B - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(BP*1), R9 - SUBL BP, R8 - LEAL -1(R8), BP - CMPL BP, $0x3c + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c JLT one_byte_repeat_emit_encodeBlockAsm8B - CMPL BP, $0x00000100 + CMPL SI, $0x00000100 JLT two_bytes_repeat_emit_encodeBlockAsm8B MOVB $0xf4, (AX) - MOVW BP, 1(AX) + MOVW SI, 1(AX) ADDQ $0x03, AX JMP memmove_long_repeat_emit_encodeBlockAsm8B two_bytes_repeat_emit_encodeBlockAsm8B: MOVB $0xf0, (AX) - MOVB BP, 1(AX) + MOVB SI, 1(AX) ADDQ $0x02, AX - CMPL BP, $0x40 + CMPL SI, $0x40 JL memmove_repeat_emit_encodeBlockAsm8B JMP memmove_long_repeat_emit_encodeBlockAsm8B one_byte_repeat_emit_encodeBlockAsm8B: - SHLB $0x02, BP - MOVB BP, (AX) + SHLB $0x02, SI + MOVB SI, (AX) ADDQ $0x01, AX memmove_repeat_emit_encodeBlockAsm8B: - LEAQ (AX)(R8*1), BP + LEAQ (AX)(R9*1), SI // genMemMoveShort - CMPQ R8, $0x03 - JB emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_1or2 - JE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_3 - CMPQ R8, $0x08 - JB emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_4through7 - CMPQ R8, $0x10 + CMPQ R9, $0x08 + JLE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8 + CMPQ R9, $0x10 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16 - CMPQ R8, $0x20 + CMPQ R9, $0x20 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64 -emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_1or2: - MOVB (R9), R10 - MOVB -1(R9)(R8*1), R9 - MOVB R10, (AX) - MOVB R9, -1(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B - -emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_3: - MOVW (R9), R10 - MOVB 2(R9), R9 - MOVW R10, (AX) - MOVB R9, 2(AX) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B - -emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_4through7: - MOVL (R9), R10 - MOVL -4(R9)(R8*1), R9 - MOVL R10, (AX) - MOVL R9, -4(AX)(R8*1) +emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8: + MOVQ (R10), R11 + MOVQ R11, (AX) JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) + MOVOU X1, -16(AX)(R9*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) memmove_end_copy_repeat_emit_encodeBlockAsm8B: - MOVQ BP, AX + MOVQ SI, AX JMP emit_literal_done_repeat_emit_encodeBlockAsm8B memmove_long_repeat_emit_encodeBlockAsm8B: - LEAQ (AX)(R8*1), BP + LEAQ (AX)(R9*1), SI // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R12 + SHRQ $0x05, R12 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R13 + SUBQ R11, R13 + DECQ R12 JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(R9)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 + LEAQ -32(R10)(R13*1), R11 + LEAQ -32(AX)(R13*1), R14 emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R11 ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 + DECQ R12 JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(R9)(R12*1), X4 - MOVOU -16(R9)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R8, R12 + MOVOU -32(R10)(R13*1), X4 + MOVOU -16(R10)(R13*1), X5 + MOVOA X4, -32(AX)(R13*1) + MOVOA X5, -16(AX)(R13*1) + ADDQ $0x20, R13 + CMPQ R9, R13 JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ BP, AX + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX emit_literal_done_repeat_emit_encodeBlockAsm8B: ADDL $0x05, CX - MOVL CX, BP - SUBL 16(SP), BP - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(BP*1), BP + MOVL CX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R9 + SUBL CX, R9 + LEAQ (DX)(CX*1), R10 + LEAQ (DX)(SI*1), SI // matchLen - XORL R11, R11 - CMPL R8, $0x08 + XORL R12, R12 + CMPL R9, $0x08 JL matchlen_single_repeat_extend_encodeBlockAsm8B matchlen_loopback_repeat_extend_encodeBlockAsm8B: - MOVQ (R9)(R11*1), R10 - XORQ (BP)(R11*1), R10 - TESTQ R10, R10 + MOVQ (R10)(R12*1), R11 + XORQ (SI)(R12*1), R11 + TESTQ R11, R11 JZ matchlen_loop_repeat_extend_encodeBlockAsm8B - BSFQ R10, R10 - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 + BSFQ R11, R11 + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 JMP repeat_extend_forward_end_encodeBlockAsm8B matchlen_loop_repeat_extend_encodeBlockAsm8B: - LEAL -8(R8), R8 - LEAL 8(R11), R11 - CMPL R8, $0x08 + LEAL -8(R9), R9 + LEAL 8(R12), R12 + CMPL R9, $0x08 JGE matchlen_loopback_repeat_extend_encodeBlockAsm8B matchlen_single_repeat_extend_encodeBlockAsm8B: - TESTL R8, R8 + TESTL R9, R9 JZ repeat_extend_forward_end_encodeBlockAsm8B matchlen_single_loopback_repeat_extend_encodeBlockAsm8B: - MOVB (R9)(R11*1), R10 - CMPB (BP)(R11*1), R10 + MOVB (R10)(R12*1), R11 + CMPB (SI)(R12*1), R11 JNE repeat_extend_forward_end_encodeBlockAsm8B - LEAL 1(R11), R11 - DECL R8 + LEAL 1(R12), R12 + DECL R9 JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm8B repeat_extend_forward_end_encodeBlockAsm8B: - ADDL R11, CX - MOVL CX, BP - SUBL SI, BP - MOVL 16(SP), SI - TESTL DI, DI + ADDL R12, CX + MOVL CX, SI + SUBL DI, SI + MOVL 16(SP), DI + TESTL R8, R8 JZ repeat_as_copy_encodeBlockAsm8B // emitRepeat - MOVL BP, SI - LEAL -4(BP), BP - CMPL SI, $0x08 + MOVL SI, DI + LEAL -4(SI), SI + CMPL DI, $0x08 JLE repeat_two_match_repeat_encodeBlockAsm8B - CMPL SI, $0x0c + CMPL DI, $0x0c JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm8B cant_repeat_two_offset_match_repeat_encodeBlockAsm8B: - CMPL BP, $0x00000104 + CMPL SI, $0x00000104 JLT repeat_three_match_repeat_encodeBlockAsm8B - LEAL -256(BP), BP + LEAL -256(SI), SI MOVW $0x0019, (AX) - MOVW BP, 2(AX) + MOVW SI, 2(AX) ADDQ $0x04, AX JMP repeat_end_emit_encodeBlockAsm8B repeat_three_match_repeat_encodeBlockAsm8B: - LEAL -4(BP), BP + LEAL -4(SI), SI MOVW $0x0015, (AX) - MOVB BP, 2(AX) + MOVB SI, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsm8B repeat_two_match_repeat_encodeBlockAsm8B: - SHLL $0x02, BP - ORL $0x01, BP - MOVW BP, (AX) + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm8B - XORQ DI, DI - LEAL 1(DI)(BP*4), BP - MOVB SI, 1(AX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, BP - MOVB BP, (AX) + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm8B repeat_as_copy_encodeBlockAsm8B: // emitCopy two_byte_offset_repeat_as_copy_encodeBlockAsm8B: - CMPL BP, $0x40 + CMPL SI, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B MOVB $0xee, (AX) - MOVW SI, 1(AX) - LEAL -60(BP), BP + MOVW DI, 1(AX) + LEAL -60(SI), SI ADDQ $0x03, AX // emitRepeat - MOVL BP, SI - LEAL -4(BP), BP - CMPL SI, $0x08 + MOVL SI, DI + LEAL -4(SI), SI + CMPL DI, $0x08 JLE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short - CMPL SI, $0x0c + CMPL DI, $0x0c JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: - CMPL BP, $0x00000104 + CMPL SI, $0x00000104 JLT repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short - LEAL -256(BP), BP + LEAL -256(SI), SI MOVW $0x0019, (AX) - MOVW BP, 2(AX) + MOVW SI, 2(AX) ADDQ $0x04, AX JMP repeat_end_emit_encodeBlockAsm8B repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: - LEAL -4(BP), BP + LEAL -4(SI), SI MOVW $0x0015, (AX) - MOVB BP, 2(AX) + MOVB SI, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsm8B repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: - SHLL $0x02, BP - ORL $0x01, BP - MOVW BP, (AX) + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm8B - XORQ DI, DI - LEAL 1(DI)(BP*4), BP - MOVB SI, 1(AX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, BP - MOVB BP, (AX) + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm8B JMP two_byte_offset_repeat_as_copy_encodeBlockAsm8B two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B: - CMPL BP, $0x0c + CMPL SI, $0x0c JGE emit_copy_three_repeat_as_copy_encodeBlockAsm8B MOVB $0x01, BL - LEAL -16(BX)(BP*4), BP - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, BP - MOVB BP, (AX) + LEAL -16(BX)(SI*4), SI + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm8B emit_copy_three_repeat_as_copy_encodeBlockAsm8B: MOVB $0x02, BL - LEAL -4(BX)(BP*4), BP - MOVB BP, (AX) - MOVW SI, 1(AX) + LEAL -4(BX)(SI*4), SI + MOVB SI, (AX) + MOVW DI, 1(AX) ADDQ $0x03, AX repeat_end_emit_encodeBlockAsm8B: @@ -4550,16 +4303,16 @@ repeat_end_emit_encodeBlockAsm8B: JMP search_loop_encodeBlockAsm8B no_repeat_found_encodeBlockAsm8B: - CMPL (DX)(BP*1), SI + CMPL (DX)(SI*1), DI JEQ candidate_match_encodeBlockAsm8B - SHRQ $0x08, SI - MOVL 24(SP)(R9*4), BP - LEAL 2(CX), R8 - CMPL (DX)(DI*1), SI + SHRQ $0x08, DI + MOVL 24(SP)(R10*4), SI + LEAL 2(CX), R9 + CMPL (DX)(R8*1), DI JEQ candidate2_match_encodeBlockAsm8B - MOVL R8, 24(SP)(R9*4) - SHRQ $0x08, SI - CMPL (DX)(BP*1), SI + MOVL R9, 24(SP)(R10*4) + SHRQ $0x08, DI + CMPL (DX)(SI*1), DI JEQ candidate3_match_encodeBlockAsm8B MOVL 20(SP), CX JMP search_loop_encodeBlockAsm8B @@ -4569,323 +4322,304 @@ candidate3_match_encodeBlockAsm8B: JMP candidate_match_encodeBlockAsm8B candidate2_match_encodeBlockAsm8B: - MOVL R8, 24(SP)(R9*4) + MOVL R9, 24(SP)(R10*4) INCL CX - MOVL DI, BP + MOVL R8, SI candidate_match_encodeBlockAsm8B: - MOVL 12(SP), SI - TESTL BP, BP + MOVL 12(SP), DI + TESTL SI, SI JZ match_extend_back_end_encodeBlockAsm8B match_extend_back_loop_encodeBlockAsm8B: - CMPL CX, SI + CMPL CX, DI JLE match_extend_back_end_encodeBlockAsm8B - MOVB -1(DX)(BP*1), BL - MOVB -1(DX)(CX*1), DI - CMPB BL, DI + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 JNE match_extend_back_end_encodeBlockAsm8B LEAL -1(CX), CX - DECL BP + DECL SI JZ match_extend_back_end_encodeBlockAsm8B JMP match_extend_back_loop_encodeBlockAsm8B match_extend_back_end_encodeBlockAsm8B: - MOVL CX, SI - SUBL 12(SP), SI - LEAQ 3(AX)(SI*1), SI - CMPQ SI, (SP) + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 3(AX)(DI*1), DI + CMPQ DI, (SP) JL match_dst_size_check_encodeBlockAsm8B MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeBlockAsm8B: - MOVL CX, SI - MOVL 12(SP), DI - CMPL DI, SI + MOVL CX, DI + MOVL 12(SP), R8 + CMPL R8, DI JEQ emit_literal_done_match_emit_encodeBlockAsm8B - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(DI*1), SI - SUBL DI, R8 - LEAL -1(R8), DI - CMPL DI, $0x3c + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(R8*1), DI + SUBL R8, R9 + LEAL -1(R9), R8 + CMPL R8, $0x3c JLT one_byte_match_emit_encodeBlockAsm8B - CMPL DI, $0x00000100 + CMPL R8, $0x00000100 JLT two_bytes_match_emit_encodeBlockAsm8B MOVB $0xf4, (AX) - MOVW DI, 1(AX) + MOVW R8, 1(AX) ADDQ $0x03, AX JMP memmove_long_match_emit_encodeBlockAsm8B two_bytes_match_emit_encodeBlockAsm8B: MOVB $0xf0, (AX) - MOVB DI, 1(AX) + MOVB R8, 1(AX) ADDQ $0x02, AX - CMPL DI, $0x40 + CMPL R8, $0x40 JL memmove_match_emit_encodeBlockAsm8B JMP memmove_long_match_emit_encodeBlockAsm8B one_byte_match_emit_encodeBlockAsm8B: - SHLB $0x02, DI - MOVB DI, (AX) + SHLB $0x02, R8 + MOVB R8, (AX) ADDQ $0x01, AX memmove_match_emit_encodeBlockAsm8B: - LEAQ (AX)(R8*1), DI + LEAQ (AX)(R9*1), R8 // genMemMoveShort - CMPQ R8, $0x03 - JB emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_1or2 - JE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_3 - CMPQ R8, $0x08 - JB emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_4through7 - CMPQ R8, $0x10 + CMPQ R9, $0x08 + JLE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8 + CMPQ R9, $0x10 JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16 - CMPQ R8, $0x20 + CMPQ R9, $0x20 JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64 -emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_1or2: - MOVB (SI), R9 - MOVB -1(SI)(R8*1), SI - MOVB R9, (AX) - MOVB SI, -1(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm8B - -emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_3: - MOVW (SI), R9 - MOVB 2(SI), SI - MOVW R9, (AX) - MOVB SI, 2(AX) - JMP memmove_end_copy_match_emit_encodeBlockAsm8B - -emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_4through7: - MOVL (SI), R9 - MOVL -4(SI)(R8*1), SI - MOVL R9, (AX) - MOVL SI, -4(AX)(R8*1) +emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8: + MOVQ (DI), R10 + MOVQ R10, (AX) JMP memmove_end_copy_match_emit_encodeBlockAsm8B emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16: - MOVQ (SI), R9 - MOVQ -8(SI)(R8*1), SI - MOVQ R9, (AX) - MOVQ SI, -8(AX)(R8*1) + MOVQ (DI), R10 + MOVQ -8(DI)(R9*1), DI + MOVQ R10, (AX) + MOVQ DI, -8(AX)(R9*1) JMP memmove_end_copy_match_emit_encodeBlockAsm8B emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32: - MOVOU (SI), X0 - MOVOU -16(SI)(R8*1), X1 + MOVOU (DI), X0 + MOVOU -16(DI)(R9*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) + MOVOU X1, -16(AX)(R9*1) JMP memmove_end_copy_match_emit_encodeBlockAsm8B emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64: - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) memmove_end_copy_match_emit_encodeBlockAsm8B: - MOVQ DI, AX + MOVQ R8, AX JMP emit_literal_done_match_emit_encodeBlockAsm8B memmove_long_match_emit_encodeBlockAsm8B: - LEAQ (AX)(R8*1), DI + LEAQ (AX)(R9*1), R8 // genMemMoveLong - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 - MOVQ R8, R10 - SHRQ $0x05, R10 - MOVQ AX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R11 - SUBQ R9, R11 - DECQ R10 + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVQ R9, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 JA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(SI)(R11*1), R9 - LEAQ -32(AX)(R11*1), R12 + LEAQ -32(DI)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back: - MOVOU (R9), X4 - MOVOU 16(R9), X5 - MOVOA X4, (R12) - MOVOA X5, 16(R12) + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 ADDQ $0x20, R12 - ADDQ $0x20, R9 - ADDQ $0x20, R11 - DECQ R10 + DECQ R11 JNA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(SI)(R11*1), X4 - MOVOU -16(SI)(R11*1), X5 - MOVOA X4, -32(AX)(R11*1) - MOVOA X5, -16(AX)(R11*1) - ADDQ $0x20, R11 - CMPQ R8, R11 + MOVOU -32(DI)(R12*1), X4 + MOVOU -16(DI)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R9, R12 JAE emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ DI, AX + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ R8, AX emit_literal_done_match_emit_encodeBlockAsm8B: match_nolit_loop_encodeBlockAsm8B: - MOVL CX, SI - SUBL BP, SI - MOVL SI, 16(SP) + MOVL CX, DI + SUBL SI, DI + MOVL DI, 16(SP) ADDL $0x04, CX - ADDL $0x04, BP - MOVQ src_len+32(FP), SI - SUBL CX, SI - LEAQ (DX)(CX*1), DI - LEAQ (DX)(BP*1), BP + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(SI*1), SI // matchLen - XORL R9, R9 - CMPL SI, $0x08 + XORL R10, R10 + CMPL DI, $0x08 JL matchlen_single_match_nolit_encodeBlockAsm8B matchlen_loopback_match_nolit_encodeBlockAsm8B: - MOVQ (DI)(R9*1), R8 - XORQ (BP)(R9*1), R8 - TESTQ R8, R8 + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + TESTQ R9, R9 JZ matchlen_loop_match_nolit_encodeBlockAsm8B - BSFQ R8, R8 - SARQ $0x03, R8 - LEAL (R9)(R8*1), R9 + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 JMP match_nolit_end_encodeBlockAsm8B matchlen_loop_match_nolit_encodeBlockAsm8B: - LEAL -8(SI), SI - LEAL 8(R9), R9 - CMPL SI, $0x08 + LEAL -8(DI), DI + LEAL 8(R10), R10 + CMPL DI, $0x08 JGE matchlen_loopback_match_nolit_encodeBlockAsm8B matchlen_single_match_nolit_encodeBlockAsm8B: - TESTL SI, SI + TESTL DI, DI JZ match_nolit_end_encodeBlockAsm8B matchlen_single_loopback_match_nolit_encodeBlockAsm8B: - MOVB (DI)(R9*1), R8 - CMPB (BP)(R9*1), R8 + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 JNE match_nolit_end_encodeBlockAsm8B - LEAL 1(R9), R9 - DECL SI + LEAL 1(R10), R10 + DECL DI JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm8B match_nolit_end_encodeBlockAsm8B: - ADDL R9, CX - MOVL 16(SP), BP - ADDL $0x04, R9 + ADDL R10, CX + MOVL 16(SP), SI + ADDL $0x04, R10 MOVL CX, 12(SP) // emitCopy two_byte_offset_match_nolit_encodeBlockAsm8B: - CMPL R9, $0x40 + CMPL R10, $0x40 JLE two_byte_offset_short_match_nolit_encodeBlockAsm8B MOVB $0xee, (AX) - MOVW BP, 1(AX) - LEAL -60(R9), R9 + MOVW SI, 1(AX) + LEAL -60(R10), R10 ADDQ $0x03, AX // emitRepeat - MOVL R9, BP - LEAL -4(R9), R9 - CMPL BP, $0x08 + MOVL R10, SI + LEAL -4(R10), R10 + CMPL SI, $0x08 JLE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short - CMPL BP, $0x0c + CMPL SI, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short: - CMPL R9, $0x00000104 + CMPL R10, $0x00000104 JLT repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short - LEAL -256(R9), R9 + LEAL -256(R10), R10 MOVW $0x0019, (AX) - MOVW R9, 2(AX) + MOVW R10, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBlockAsm8B repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short: - LEAL -4(R9), R9 + LEAL -4(R10), R10 MOVW $0x0015, (AX) - MOVB R9, 2(AX) + MOVB R10, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBlockAsm8B repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short: - SHLL $0x02, R9 - ORL $0x01, R9 - MOVW R9, (AX) + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm8B - XORQ SI, SI - LEAL 1(SI)(R9*4), R9 - MOVB BP, 1(AX) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, R9 - MOVB R9, (AX) + XORQ DI, DI + LEAL 1(DI)(R10*4), R10 + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm8B JMP two_byte_offset_match_nolit_encodeBlockAsm8B two_byte_offset_short_match_nolit_encodeBlockAsm8B: - CMPL R9, $0x0c + CMPL R10, $0x0c JGE emit_copy_three_match_nolit_encodeBlockAsm8B MOVB $0x01, BL - LEAL -16(BX)(R9*4), R9 - MOVB BP, 1(AX) - SHRL $0x08, BP - SHLL $0x05, BP - ORL BP, R9 - MOVB R9, (AX) + LEAL -16(BX)(R10*4), R10 + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm8B emit_copy_three_match_nolit_encodeBlockAsm8B: MOVB $0x02, BL - LEAL -4(BX)(R9*4), R9 - MOVB R9, (AX) - MOVW BP, 1(AX) + LEAL -4(BX)(R10*4), R10 + MOVB R10, (AX) + MOVW SI, 1(AX) ADDQ $0x03, AX match_nolit_emitcopy_end_encodeBlockAsm8B: CMPL CX, 8(SP) JGE emit_remainder_encodeBlockAsm8B - MOVQ -2(DX)(CX*1), SI + MOVQ -2(DX)(CX*1), DI CMPQ AX, (SP) JL match_nolit_dst_ok_encodeBlockAsm8B MOVQ $0x00000000, ret+48(FP) RET match_nolit_dst_ok_encodeBlockAsm8B: - MOVQ $0x9e3779b1, R8 - MOVQ SI, DI - SHRQ $0x10, SI - MOVQ SI, BP - SHLQ $0x20, DI - IMULQ R8, DI - SHRQ $0x38, DI - SHLQ $0x20, BP - IMULQ R8, BP - SHRQ $0x38, BP - LEAL -2(CX), R8 - LEAQ 24(SP)(BP*4), R9 - MOVL (R9), BP - MOVL R8, 24(SP)(DI*4) - MOVL CX, (R9) - CMPL (DX)(BP*1), SI + MOVQ $0x9e3779b1, R9 + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, SI + SHLQ $0x20, R8 + IMULQ R9, R8 + SHRQ $0x38, R8 + SHLQ $0x20, SI + IMULQ R9, SI + SHRQ $0x38, SI + LEAL -2(CX), R9 + LEAQ 24(SP)(SI*4), R10 + MOVL (R10), SI + MOVL R9, 24(SP)(R8*4) + MOVL CX, (R10) + CMPL (DX)(SI*1), DI JEQ match_nolit_loop_encodeBlockAsm8B INCL CX JMP search_loop_encodeBlockAsm8B @@ -4904,11 +4638,11 @@ emit_remainder_ok_encodeBlockAsm8B: MOVL 12(SP), BX CMPL BX, CX JEQ emit_literal_done_emit_remainder_encodeBlockAsm8B - MOVL CX, BP + MOVL CX, SI MOVL CX, 12(SP) LEAQ (DX)(BX*1), CX - SUBL BX, BP - LEAL -1(BP), DX + SUBL BX, SI + LEAL -1(SI), DX CMPL DX, $0x3c JLT one_byte_emit_remainder_encodeBlockAsm8B CMPL DX, $0x00000100 @@ -4932,46 +4666,27 @@ one_byte_emit_remainder_encodeBlockAsm8B: ADDQ $0x01, AX memmove_emit_remainder_encodeBlockAsm8B: - LEAQ (AX)(BP*1), DX - MOVL BP, BX + LEAQ (AX)(SI*1), DX + MOVL SI, BX // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3 CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7 + JLE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2: - MOVB (CX), BP - MOVB -1(CX)(BX*1), CL - MOVB BP, (AX) - MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3: - MOVW (CX), BP - MOVB 2(CX), CL - MOVW BP, (AX) - MOVB CL, 2(AX) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7: - MOVL (CX), BP - MOVL -4(CX)(BX*1), CX - MOVL BP, (AX) - MOVL CX, -4(AX)(BX*1) +emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8: + MOVQ (CX), SI + MOVQ SI, (AX) JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16: - MOVQ (CX), BP + MOVQ (CX), SI MOVQ -8(CX)(BX*1), CX - MOVQ BP, (AX) + MOVQ SI, (AX) MOVQ CX, -8(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B @@ -4997,43 +4712,43 @@ memmove_end_copy_emit_remainder_encodeBlockAsm8B: JMP emit_literal_done_emit_remainder_encodeBlockAsm8B memmove_long_emit_remainder_encodeBlockAsm8B: - LEAQ (AX)(BP*1), DX - MOVL BP, BX + LEAQ (AX)(SI*1), DX + MOVL SI, BX // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 - MOVQ BX, SI - SHRQ $0x05, SI - MOVQ AX, BP - ANDL $0x0000001f, BP - MOVQ $0x00000040, DI - SUBQ BP, DI - DECQ SI + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(CX)(DI*1), BP - LEAQ -32(AX)(DI*1), R8 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back: - MOVOU (BP), X4 - MOVOU 16(BP), X5 - MOVOA X4, (R8) - MOVOA X5, 16(R8) + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI ADDQ $0x20, R8 - ADDQ $0x20, BP - ADDQ $0x20, DI - DECQ SI + DECQ DI JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(CX)(DI*1), X4 - MOVOU -16(CX)(DI*1), X5 - MOVOA X4, -32(AX)(DI*1) - MOVOA X5, -16(AX)(DI*1) - ADDQ $0x20, DI - CMPQ BX, DI + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) @@ -5070,8 +4785,8 @@ zero_loop_encodeBetterBlockAsm: MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -6(CX), DX - LEAQ -8(CX), BP - MOVL BP, 8(SP) + LEAQ -8(CX), SI + MOVL SI, 8(SP) SHRQ $0x05, CX SUBL CX, DX LEAQ (AX)(DX*1), DX @@ -5081,329 +4796,327 @@ zero_loop_encodeBetterBlockAsm: MOVQ src_base+24(FP), DX search_loop_encodeBetterBlockAsm: - MOVQ (DX)(CX*1), SI - MOVL CX, BP - SUBL 12(SP), BP - SHRL $0x07, BP - LEAL 1(CX)(BP*1), BP - CMPL BP, 8(SP) + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x07, SI + CMPL SI, $0x63 + JLE check_maxskip_ok_encodeBetterBlockAsm + LEAL 100(CX), SI + JMP check_maxskip_cont_encodeBetterBlockAsm + +check_maxskip_ok_encodeBetterBlockAsm: + LEAL 1(CX)(SI*1), SI + +check_maxskip_cont_encodeBetterBlockAsm: + CMPL SI, 8(SP) JGE emit_remainder_encodeBetterBlockAsm - MOVL BP, 20(SP) - MOVQ $0x00cf1bbcdcbfa563, R8 - MOVQ $0x9e3779b1, BP - MOVQ SI, R9 - MOVQ SI, R10 - SHLQ $0x08, R9 - IMULQ R8, R9 - SHRQ $0x30, R9 - SHLQ $0x20, R10 - IMULQ BP, R10 - SHRQ $0x32, R10 - MOVL 24(SP)(R9*4), BP - MOVL 262168(SP)(R10*4), DI - MOVL CX, 24(SP)(R9*4) - MOVL CX, 262168(SP)(R10*4) - CMPL (DX)(BP*1), SI + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x00cf1bbcdcbfa563, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x08, R10 + IMULQ R9, R10 + SHRQ $0x30, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x32, R11 + MOVL 24(SP)(R10*4), SI + MOVL 262168(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + MOVL CX, 262168(SP)(R11*4) + CMPL (DX)(SI*1), DI JEQ candidate_match_encodeBetterBlockAsm - CMPL (DX)(DI*1), SI + CMPL (DX)(R8*1), DI JEQ candidateS_match_encodeBetterBlockAsm MOVL 20(SP), CX JMP search_loop_encodeBetterBlockAsm candidateS_match_encodeBetterBlockAsm: - SHRQ $0x08, SI - MOVQ SI, R9 - SHLQ $0x08, R9 - IMULQ R8, R9 - SHRQ $0x30, R9 - MOVL 24(SP)(R9*4), BP + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x08, R10 + IMULQ R9, R10 + SHRQ $0x30, R10 + MOVL 24(SP)(R10*4), SI INCL CX - MOVL CX, 24(SP)(R9*4) - CMPL (DX)(BP*1), SI + MOVL CX, 24(SP)(R10*4) + CMPL (DX)(SI*1), DI JEQ candidate_match_encodeBetterBlockAsm DECL CX - MOVL DI, BP + MOVL R8, SI candidate_match_encodeBetterBlockAsm: - MOVL 12(SP), SI - TESTL BP, BP + MOVL 12(SP), DI + TESTL SI, SI JZ match_extend_back_end_encodeBetterBlockAsm match_extend_back_loop_encodeBetterBlockAsm: - CMPL CX, SI + CMPL CX, DI JLE match_extend_back_end_encodeBetterBlockAsm - MOVB -1(DX)(BP*1), BL - MOVB -1(DX)(CX*1), DI - CMPB BL, DI + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 JNE match_extend_back_end_encodeBetterBlockAsm LEAL -1(CX), CX - DECL BP + DECL SI JZ match_extend_back_end_encodeBetterBlockAsm JMP match_extend_back_loop_encodeBetterBlockAsm match_extend_back_end_encodeBetterBlockAsm: - MOVL CX, SI - SUBL 12(SP), SI - LEAQ 5(AX)(SI*1), SI - CMPQ SI, (SP) + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 5(AX)(DI*1), DI + CMPQ DI, (SP) JL match_dst_size_check_encodeBetterBlockAsm MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeBetterBlockAsm: - MOVL CX, SI + MOVL CX, DI ADDL $0x04, CX - ADDL $0x04, BP - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(BP*1), R9 + ADDL $0x04, SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), R10 // matchLen - XORL R11, R11 - CMPL DI, $0x08 + XORL R12, R12 + CMPL R8, $0x08 JL matchlen_single_match_nolit_encodeBetterBlockAsm matchlen_loopback_match_nolit_encodeBetterBlockAsm: - MOVQ (R8)(R11*1), R10 - XORQ (R9)(R11*1), R10 - TESTQ R10, R10 + MOVQ (R9)(R12*1), R11 + XORQ (R10)(R12*1), R11 + TESTQ R11, R11 JZ matchlen_loop_match_nolit_encodeBetterBlockAsm - BSFQ R10, R10 - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 + BSFQ R11, R11 + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 JMP match_nolit_end_encodeBetterBlockAsm matchlen_loop_match_nolit_encodeBetterBlockAsm: - LEAL -8(DI), DI - LEAL 8(R11), R11 - CMPL DI, $0x08 + LEAL -8(R8), R8 + LEAL 8(R12), R12 + CMPL R8, $0x08 JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm matchlen_single_match_nolit_encodeBetterBlockAsm: - TESTL DI, DI + TESTL R8, R8 JZ match_nolit_end_encodeBetterBlockAsm matchlen_single_loopback_match_nolit_encodeBetterBlockAsm: - MOVB (R8)(R11*1), R10 - CMPB (R9)(R11*1), R10 + MOVB (R9)(R12*1), R11 + CMPB (R10)(R12*1), R11 JNE match_nolit_end_encodeBetterBlockAsm - LEAL 1(R11), R11 - DECL DI + LEAL 1(R12), R12 + DECL R8 JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm match_nolit_end_encodeBetterBlockAsm: - MOVL CX, DI - SUBL BP, DI + MOVL CX, R8 + SUBL SI, R8 // Check if repeat - CMPL 16(SP), DI + CMPL 16(SP), R8 JEQ match_is_repeat_encodeBetterBlockAsm - CMPL R11, $0x01 + CMPL R12, $0x01 JG match_length_ok_encodeBetterBlockAsm - CMPL DI, $0x0000ffff + CMPL R8, $0x0000ffff JLE match_length_ok_encodeBetterBlockAsm MOVL 20(SP), CX INCL CX JMP search_loop_encodeBetterBlockAsm match_length_ok_encodeBetterBlockAsm: - MOVL DI, 16(SP) - MOVL 12(SP), BP - CMPL BP, SI + MOVL R8, 16(SP) + MOVL 12(SP), SI + CMPL SI, DI JEQ emit_literal_done_match_emit_encodeBetterBlockAsm - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(BP*1), R9 - SUBL BP, R8 - LEAL -1(R8), BP - CMPL BP, $0x3c + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c JLT one_byte_match_emit_encodeBetterBlockAsm - CMPL BP, $0x00000100 + CMPL SI, $0x00000100 JLT two_bytes_match_emit_encodeBetterBlockAsm - CMPL BP, $0x00010000 + CMPL SI, $0x00010000 JLT three_bytes_match_emit_encodeBetterBlockAsm - CMPL BP, $0x01000000 + CMPL SI, $0x01000000 JLT four_bytes_match_emit_encodeBetterBlockAsm MOVB $0xfc, (AX) - MOVL BP, 1(AX) + MOVL SI, 1(AX) ADDQ $0x05, AX JMP memmove_long_match_emit_encodeBetterBlockAsm four_bytes_match_emit_encodeBetterBlockAsm: - MOVL BP, R10 - SHRL $0x10, R10 + MOVL SI, R11 + SHRL $0x10, R11 MOVB $0xf8, (AX) - MOVW BP, 1(AX) - MOVB R10, 3(AX) + MOVW SI, 1(AX) + MOVB R11, 3(AX) ADDQ $0x04, AX JMP memmove_long_match_emit_encodeBetterBlockAsm three_bytes_match_emit_encodeBetterBlockAsm: MOVB $0xf4, (AX) - MOVW BP, 1(AX) + MOVW SI, 1(AX) ADDQ $0x03, AX JMP memmove_long_match_emit_encodeBetterBlockAsm two_bytes_match_emit_encodeBetterBlockAsm: MOVB $0xf0, (AX) - MOVB BP, 1(AX) + MOVB SI, 1(AX) ADDQ $0x02, AX - CMPL BP, $0x40 + CMPL SI, $0x40 JL memmove_match_emit_encodeBetterBlockAsm JMP memmove_long_match_emit_encodeBetterBlockAsm one_byte_match_emit_encodeBetterBlockAsm: - SHLB $0x02, BP - MOVB BP, (AX) + SHLB $0x02, SI + MOVB SI, (AX) ADDQ $0x01, AX memmove_match_emit_encodeBetterBlockAsm: - LEAQ (AX)(R8*1), BP + LEAQ (AX)(R9*1), SI // genMemMoveShort - CMPQ R8, $0x03 - JB emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_1or2 - JE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_3 - CMPQ R8, $0x08 + CMPQ R9, $0x04 + JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4 + CMPQ R9, $0x08 JB emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7 - CMPQ R8, $0x10 + CMPQ R9, $0x10 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16 - CMPQ R8, $0x20 + CMPQ R9, $0x20 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64 -emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_1or2: - MOVB (R9), R10 - MOVB -1(R9)(R8*1), R9 - MOVB R10, (AX) - MOVB R9, -1(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm - -emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_3: - MOVW (R9), R10 - MOVB 2(R9), R9 - MOVW R10, (AX) - MOVB R9, 2(AX) +emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4: + MOVL (R10), R11 + MOVL R11, (AX) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7: - MOVL (R9), R10 - MOVL -4(R9)(R8*1), R9 - MOVL R10, (AX) - MOVL R9, -4(AX)(R8*1) + MOVL (R10), R11 + MOVL -4(R10)(R9*1), R10 + MOVL R11, (AX) + MOVL R10, -4(AX)(R9*1) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) + MOVOU X1, -16(AX)(R9*1) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) memmove_end_copy_match_emit_encodeBetterBlockAsm: - MOVQ BP, AX + MOVQ SI, AX JMP emit_literal_done_match_emit_encodeBetterBlockAsm memmove_long_match_emit_encodeBetterBlockAsm: - LEAQ (AX)(R8*1), BP + LEAQ (AX)(R9*1), SI // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R12 - SHRQ $0x05, R12 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R13 - SUBQ R10, R13 - DECQ R12 + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(R9)(R13*1), R10 - LEAQ -32(AX)(R13*1), R14 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(AX)(R14*1), R15 emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 ADDQ $0x20, R14 - ADDQ $0x20, R10 - ADDQ $0x20, R13 - DECQ R12 + DECQ R13 JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(R9)(R13*1), X4 - MOVOU -16(R9)(R13*1), X5 - MOVOA X4, -32(AX)(R13*1) - MOVOA X5, -16(AX)(R13*1) - ADDQ $0x20, R13 - CMPQ R8, R13 + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ BP, AX + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX emit_literal_done_match_emit_encodeBetterBlockAsm: - ADDL R11, CX - ADDL $0x04, R11 + ADDL R12, CX + ADDL $0x04, R12 MOVL CX, 12(SP) // emitCopy - CMPL DI, $0x00010000 + CMPL R8, $0x00010000 JL two_byte_offset_match_nolit_encodeBetterBlockAsm four_bytes_loop_back_match_nolit_encodeBetterBlockAsm: - CMPL R11, $0x40 + CMPL R12, $0x40 JLE four_bytes_remain_match_nolit_encodeBetterBlockAsm MOVB $0xff, (AX) - MOVL DI, 1(AX) - LEAL -64(R11), R11 + MOVL R8, 1(AX) + LEAL -64(R12), R12 ADDQ $0x05, AX - CMPL R11, $0x04 + CMPL R12, $0x04 JL four_bytes_remain_match_nolit_encodeBetterBlockAsm // emitRepeat emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy: - MOVL R11, BP - LEAL -4(R11), R11 - CMPL BP, $0x08 + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 JLE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy - CMPL BP, $0x0c + CMPL SI, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy - CMPL DI, $0x00000800 + CMPL R8, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy: - CMPL R11, $0x00000104 + CMPL R12, $0x00000104 JLT repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy - CMPL R11, $0x00010100 + CMPL R12, $0x00010100 JLT repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy - CMPL R11, $0x0100ffff + CMPL R12, $0x0100ffff JLT repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy - LEAL -16842747(R11), R11 + LEAL -16842747(R12), R12 MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -5411,85 +5124,85 @@ cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy: JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy: - LEAL -65536(R11), R11 - MOVL R11, DI + LEAL -65536(R12), R12 + MOVL R12, R8 MOVW $0x001d, (AX) - MOVW R11, 2(AX) - SARL $0x10, DI - MOVB DI, 4(AX) + MOVW R12, 2(AX) + SARL $0x10, R8 + MOVB R8, 4(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy: - LEAL -256(R11), R11 + LEAL -256(R12), R12 MOVW $0x0019, (AX) - MOVW R11, 2(AX) + MOVW R12, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy: - LEAL -4(R11), R11 + LEAL -4(R12), R12 MOVW $0x0015, (AX) - MOVB R11, 2(AX) + MOVB R12, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy: - SHLL $0x02, R11 - ORL $0x01, R11 - MOVW R11, (AX) + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy: - XORQ BP, BP - LEAL 1(BP)(R11*4), R11 - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, R11 - MOVB R11, (AX) + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm JMP four_bytes_loop_back_match_nolit_encodeBetterBlockAsm four_bytes_remain_match_nolit_encodeBetterBlockAsm: - TESTL R11, R11 + TESTL R12, R12 JZ match_nolit_emitcopy_end_encodeBetterBlockAsm MOVB $0x03, BL - LEAL -4(BX)(R11*4), R11 - MOVB R11, (AX) - MOVL DI, 1(AX) + LEAL -4(BX)(R12*4), R12 + MOVB R12, (AX) + MOVL R8, 1(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm two_byte_offset_match_nolit_encodeBetterBlockAsm: - CMPL R11, $0x40 + CMPL R12, $0x40 JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm MOVB $0xee, (AX) - MOVW DI, 1(AX) - LEAL -60(R11), R11 + MOVW R8, 1(AX) + LEAL -60(R12), R12 ADDQ $0x03, AX // emitRepeat emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short: - MOVL R11, BP - LEAL -4(R11), R11 - CMPL BP, $0x08 + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 JLE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short - CMPL BP, $0x0c + CMPL SI, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short - CMPL DI, $0x00000800 + CMPL R8, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short: - CMPL R11, $0x00000104 + CMPL R12, $0x00000104 JLT repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short - CMPL R11, $0x00010100 + CMPL R12, $0x00010100 JLT repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short - CMPL R11, $0x0100ffff + CMPL R12, $0x0100ffff JLT repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short - LEAL -16842747(R11), R11 + LEAL -16842747(R12), R12 MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -5497,253 +5210,243 @@ cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short: JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short: - LEAL -65536(R11), R11 - MOVL R11, DI + LEAL -65536(R12), R12 + MOVL R12, R8 MOVW $0x001d, (AX) - MOVW R11, 2(AX) - SARL $0x10, DI - MOVB DI, 4(AX) + MOVW R12, 2(AX) + SARL $0x10, R8 + MOVB R8, 4(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short: - LEAL -256(R11), R11 + LEAL -256(R12), R12 MOVW $0x0019, (AX) - MOVW R11, 2(AX) + MOVW R12, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short: - LEAL -4(R11), R11 + LEAL -4(R12), R12 MOVW $0x0015, (AX) - MOVB R11, 2(AX) + MOVB R12, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short: - SHLL $0x02, R11 - ORL $0x01, R11 - MOVW R11, (AX) + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short: - XORQ BP, BP - LEAL 1(BP)(R11*4), R11 - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, R11 - MOVB R11, (AX) + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm JMP two_byte_offset_match_nolit_encodeBetterBlockAsm two_byte_offset_short_match_nolit_encodeBetterBlockAsm: - CMPL R11, $0x0c + CMPL R12, $0x0c JGE emit_copy_three_match_nolit_encodeBetterBlockAsm - CMPL DI, $0x00000800 + CMPL R8, $0x00000800 JGE emit_copy_three_match_nolit_encodeBetterBlockAsm MOVB $0x01, BL - LEAL -16(BX)(R11*4), R11 - MOVB DI, 1(AX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, R11 - MOVB R11, (AX) + LEAL -16(BX)(R12*4), R12 + MOVB R8, 1(AX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm emit_copy_three_match_nolit_encodeBetterBlockAsm: MOVB $0x02, BL - LEAL -4(BX)(R11*4), R11 - MOVB R11, (AX) - MOVW DI, 1(AX) + LEAL -4(BX)(R12*4), R12 + MOVB R12, (AX) + MOVW R8, 1(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm match_is_repeat_encodeBetterBlockAsm: - MOVL 12(SP), BP - CMPL BP, SI + MOVL 12(SP), SI + CMPL SI, DI JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(BP*1), R9 - SUBL BP, R8 - LEAL -1(R8), BP - CMPL BP, $0x3c + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c JLT one_byte_match_emit_repeat_encodeBetterBlockAsm - CMPL BP, $0x00000100 + CMPL SI, $0x00000100 JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm - CMPL BP, $0x00010000 + CMPL SI, $0x00010000 JLT three_bytes_match_emit_repeat_encodeBetterBlockAsm - CMPL BP, $0x01000000 + CMPL SI, $0x01000000 JLT four_bytes_match_emit_repeat_encodeBetterBlockAsm MOVB $0xfc, (AX) - MOVL BP, 1(AX) + MOVL SI, 1(AX) ADDQ $0x05, AX JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm four_bytes_match_emit_repeat_encodeBetterBlockAsm: - MOVL BP, R10 - SHRL $0x10, R10 + MOVL SI, R11 + SHRL $0x10, R11 MOVB $0xf8, (AX) - MOVW BP, 1(AX) - MOVB R10, 3(AX) + MOVW SI, 1(AX) + MOVB R11, 3(AX) ADDQ $0x04, AX JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm three_bytes_match_emit_repeat_encodeBetterBlockAsm: MOVB $0xf4, (AX) - MOVW BP, 1(AX) + MOVW SI, 1(AX) ADDQ $0x03, AX JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm two_bytes_match_emit_repeat_encodeBetterBlockAsm: MOVB $0xf0, (AX) - MOVB BP, 1(AX) + MOVB SI, 1(AX) ADDQ $0x02, AX - CMPL BP, $0x40 + CMPL SI, $0x40 JL memmove_match_emit_repeat_encodeBetterBlockAsm JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm one_byte_match_emit_repeat_encodeBetterBlockAsm: - SHLB $0x02, BP - MOVB BP, (AX) + SHLB $0x02, SI + MOVB SI, (AX) ADDQ $0x01, AX memmove_match_emit_repeat_encodeBetterBlockAsm: - LEAQ (AX)(R8*1), BP + LEAQ (AX)(R9*1), SI // genMemMoveShort - CMPQ R8, $0x03 - JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_1or2 - JE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_3 - CMPQ R8, $0x08 + CMPQ R9, $0x04 + JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4 + CMPQ R9, $0x08 JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7 - CMPQ R8, $0x10 + CMPQ R9, $0x10 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16 - CMPQ R8, $0x20 + CMPQ R9, $0x20 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64 -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_1or2: - MOVB (R9), R10 - MOVB -1(R9)(R8*1), R9 - MOVB R10, (AX) - MOVB R9, -1(AX)(R8*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_3: - MOVW (R9), R10 - MOVB 2(R9), R9 - MOVW R10, (AX) - MOVB R9, 2(AX) +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4: + MOVL (R10), R11 + MOVL R11, (AX) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7: - MOVL (R9), R10 - MOVL -4(R9)(R8*1), R9 - MOVL R10, (AX) - MOVL R9, -4(AX)(R8*1) + MOVL (R10), R11 + MOVL -4(R10)(R9*1), R10 + MOVL R11, (AX) + MOVL R10, -4(AX)(R9*1) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) + MOVOU X1, -16(AX)(R9*1) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm: - MOVQ BP, AX + MOVQ SI, AX JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm memmove_long_match_emit_repeat_encodeBetterBlockAsm: - LEAQ (AX)(R8*1), BP + LEAQ (AX)(R9*1), SI // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R12 - SHRQ $0x05, R12 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R13 - SUBQ R10, R13 - DECQ R12 + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(R9)(R13*1), R10 - LEAQ -32(AX)(R13*1), R14 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(AX)(R14*1), R15 emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 ADDQ $0x20, R14 - ADDQ $0x20, R10 - ADDQ $0x20, R13 - DECQ R12 + DECQ R13 JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(R9)(R13*1), X4 - MOVOU -16(R9)(R13*1), X5 - MOVOA X4, -32(AX)(R13*1) - MOVOA X5, -16(AX)(R13*1) - ADDQ $0x20, R13 - CMPQ R8, R13 + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ BP, AX + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX emit_literal_done_match_emit_repeat_encodeBetterBlockAsm: - ADDL R11, CX - ADDL $0x04, R11 + ADDL R12, CX + ADDL $0x04, R12 MOVL CX, 12(SP) // emitRepeat emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm: - MOVL R11, BP - LEAL -4(R11), R11 - CMPL BP, $0x08 + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm - CMPL BP, $0x0c + CMPL SI, $0x0c JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm - CMPL DI, $0x00000800 + CMPL R8, $0x00000800 JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm: - CMPL R11, $0x00000104 + CMPL R12, $0x00000104 JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm - CMPL R11, $0x00010100 + CMPL R12, $0x00010100 JLT repeat_four_match_nolit_repeat_encodeBetterBlockAsm - CMPL R11, $0x0100ffff + CMPL R12, $0x0100ffff JLT repeat_five_match_nolit_repeat_encodeBetterBlockAsm - LEAL -16842747(R11), R11 + LEAL -16842747(R12), R12 MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -5751,44 +5454,44 @@ cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm: JMP emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm repeat_five_match_nolit_repeat_encodeBetterBlockAsm: - LEAL -65536(R11), R11 - MOVL R11, DI + LEAL -65536(R12), R12 + MOVL R12, R8 MOVW $0x001d, (AX) - MOVW R11, 2(AX) - SARL $0x10, DI - MOVB DI, 4(AX) + MOVW R12, 2(AX) + SARL $0x10, R8 + MOVB R8, 4(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_four_match_nolit_repeat_encodeBetterBlockAsm: - LEAL -256(R11), R11 + LEAL -256(R12), R12 MOVW $0x0019, (AX) - MOVW R11, 2(AX) + MOVW R12, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_three_match_nolit_repeat_encodeBetterBlockAsm: - LEAL -4(R11), R11 + LEAL -4(R12), R12 MOVW $0x0015, (AX) - MOVB R11, 2(AX) + MOVB R12, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_two_match_nolit_repeat_encodeBetterBlockAsm: - SHLL $0x02, R11 - ORL $0x01, R11 - MOVW R11, (AX) + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm: - XORQ BP, BP - LEAL 1(BP)(R11*4), R11 - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, R11 - MOVB R11, (AX) + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) ADDQ $0x02, AX match_nolit_emitcopy_end_encodeBetterBlockAsm: @@ -5800,36 +5503,53 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm: RET match_nolit_dst_ok_encodeBetterBlockAsm: - MOVQ $0x00cf1bbcdcbfa563, BP - MOVQ $0x9e3779b1, DI - INCL SI - MOVQ (DX)(SI*1), R8 - MOVQ R8, R9 - MOVQ R8, R10 - SHRQ $0x08, R10 - LEAL 1(SI), R11 - MOVQ -2(DX)(CX*1), R8 - SHLQ $0x08, R9 - IMULQ BP, R9 - SHRQ $0x30, R9 - SHLQ $0x20, R10 - IMULQ DI, R10 - SHRQ $0x32, R10 - MOVL SI, 24(SP)(R9*4) - MOVL R11, 262168(SP)(R10*4) - MOVQ R8, R9 - MOVQ R8, R10 - SHRQ $0x08, R10 - LEAL -2(CX), R8 - LEAL -1(CX), SI - SHLQ $0x08, R9 - IMULQ BP, R9 - SHRQ $0x30, R9 - SHLQ $0x20, R10 - IMULQ DI, R10 - SHRQ $0x32, R10 - MOVL R8, 24(SP)(R9*4) - MOVL SI, 262168(SP)(R10*4) + MOVQ $0x00cf1bbcdcbfa563, SI + MOVQ $0x9e3779b1, R8 + INCL DI + MOVQ (DX)(DI*1), R9 + MOVQ R9, R10 + MOVQ R9, R11 + MOVQ R9, R12 + SHRQ $0x08, R11 + MOVQ R11, R13 + SHRQ $0x10, R12 + LEAL 1(DI), R14 + LEAL 2(DI), R15 + MOVQ -2(DX)(CX*1), R9 + SHLQ $0x08, R10 + IMULQ SI, R10 + SHRQ $0x30, R10 + SHLQ $0x08, R13 + IMULQ SI, R13 + SHRQ $0x30, R13 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x32, R11 + SHLQ $0x20, R12 + IMULQ R8, R12 + SHRQ $0x32, R12 + MOVL DI, 24(SP)(R10*4) + MOVL R14, 24(SP)(R13*4) + MOVL R14, 262168(SP)(R11*4) + MOVL R15, 262168(SP)(R12*4) + MOVQ R9, R10 + MOVQ R9, R11 + SHRQ $0x08, R11 + MOVQ R11, R13 + LEAL -2(CX), R9 + LEAL -1(CX), DI + SHLQ $0x08, R10 + IMULQ SI, R10 + SHRQ $0x30, R10 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x32, R11 + SHLQ $0x08, R13 + IMULQ SI, R13 + SHRQ $0x30, R13 + MOVL R9, 24(SP)(R10*4) + MOVL DI, 262168(SP)(R11*4) + MOVL DI, 24(SP)(R13*4) JMP search_loop_encodeBetterBlockAsm emit_remainder_encodeBetterBlockAsm: @@ -5846,11 +5566,11 @@ emit_remainder_ok_encodeBetterBlockAsm: MOVL 12(SP), BX CMPL BX, CX JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm - MOVL CX, BP + MOVL CX, SI MOVL CX, 12(SP) LEAQ (DX)(BX*1), CX - SUBL BX, BP - LEAL -1(BP), DX + SUBL BX, SI + LEAL -1(SI), DX CMPL DX, $0x3c JLT one_byte_emit_remainder_encodeBetterBlockAsm CMPL DX, $0x00000100 @@ -5893,13 +5613,12 @@ one_byte_emit_remainder_encodeBetterBlockAsm: ADDQ $0x01, AX memmove_emit_remainder_encodeBetterBlockAsm: - LEAQ (AX)(BP*1), DX - MOVL BP, BX + LEAQ (AX)(SI*1), DX + MOVL SI, BX // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3 + CMPQ BX, $0x04 + JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7 CMPQ BX, $0x10 @@ -5908,31 +5627,22 @@ memmove_emit_remainder_encodeBetterBlockAsm: JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2: - MOVB (CX), BP - MOVB -1(CX)(BX*1), CL - MOVB BP, (AX) - MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3: - MOVW (CX), BP - MOVB 2(CX), CL - MOVW BP, (AX) - MOVB CL, 2(AX) +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4: + MOVL (CX), SI + MOVL SI, (AX) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7: - MOVL (CX), BP + MOVL (CX), SI MOVL -4(CX)(BX*1), CX - MOVL BP, (AX) + MOVL SI, (AX) MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16: - MOVQ (CX), BP + MOVQ (CX), SI MOVQ -8(CX)(BX*1), CX - MOVQ BP, (AX) + MOVQ SI, (AX) MOVQ CX, -8(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm @@ -5958,43 +5668,43 @@ memmove_end_copy_emit_remainder_encodeBetterBlockAsm: JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm memmove_long_emit_remainder_encodeBetterBlockAsm: - LEAQ (AX)(BP*1), DX - MOVL BP, BX + LEAQ (AX)(SI*1), DX + MOVL SI, BX // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 - MOVQ BX, SI - SHRQ $0x05, SI - MOVQ AX, BP - ANDL $0x0000001f, BP - MOVQ $0x00000040, DI - SUBQ BP, DI - DECQ SI + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(CX)(DI*1), BP - LEAQ -32(AX)(DI*1), R8 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back: - MOVOU (BP), X4 - MOVOU 16(BP), X5 - MOVOA X4, (R8) - MOVOA X5, 16(R8) + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI ADDQ $0x20, R8 - ADDQ $0x20, BP - ADDQ $0x20, DI - DECQ SI + DECQ DI JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(CX)(DI*1), X4 - MOVOU -16(CX)(DI*1), X5 - MOVOA X4, -32(AX)(DI*1) - MOVOA X5, -16(AX)(DI*1) - ADDQ $0x20, DI - CMPQ BX, DI + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) @@ -6031,8 +5741,8 @@ zero_loop_encodeBetterBlockAsm4MB: MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -6(CX), DX - LEAQ -8(CX), BP - MOVL BP, 8(SP) + LEAQ -8(CX), SI + MOVL SI, 8(SP) SHRQ $0x05, CX SUBL CX, DX LEAQ (AX)(DX*1), DX @@ -6042,665 +5752,653 @@ zero_loop_encodeBetterBlockAsm4MB: MOVQ src_base+24(FP), DX search_loop_encodeBetterBlockAsm4MB: - MOVQ (DX)(CX*1), SI - MOVL CX, BP - SUBL 12(SP), BP - SHRL $0x07, BP - LEAL 1(CX)(BP*1), BP - CMPL BP, 8(SP) + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x07, SI + CMPL SI, $0x63 + JLE check_maxskip_ok_encodeBetterBlockAsm4MB + LEAL 100(CX), SI + JMP check_maxskip_cont_encodeBetterBlockAsm4MB + +check_maxskip_ok_encodeBetterBlockAsm4MB: + LEAL 1(CX)(SI*1), SI + +check_maxskip_cont_encodeBetterBlockAsm4MB: + CMPL SI, 8(SP) JGE emit_remainder_encodeBetterBlockAsm4MB - MOVL BP, 20(SP) - MOVQ $0x00cf1bbcdcbfa563, R8 - MOVQ $0x9e3779b1, BP - MOVQ SI, R9 - MOVQ SI, R10 - SHLQ $0x08, R9 - IMULQ R8, R9 - SHRQ $0x30, R9 - SHLQ $0x20, R10 - IMULQ BP, R10 - SHRQ $0x32, R10 - MOVL 24(SP)(R9*4), BP - MOVL 262168(SP)(R10*4), DI - MOVL CX, 24(SP)(R9*4) - MOVL CX, 262168(SP)(R10*4) - CMPL (DX)(BP*1), SI + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x00cf1bbcdcbfa563, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x08, R10 + IMULQ R9, R10 + SHRQ $0x30, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x32, R11 + MOVL 24(SP)(R10*4), SI + MOVL 262168(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + MOVL CX, 262168(SP)(R11*4) + CMPL (DX)(SI*1), DI JEQ candidate_match_encodeBetterBlockAsm4MB - CMPL (DX)(DI*1), SI + CMPL (DX)(R8*1), DI JEQ candidateS_match_encodeBetterBlockAsm4MB MOVL 20(SP), CX JMP search_loop_encodeBetterBlockAsm4MB candidateS_match_encodeBetterBlockAsm4MB: - SHRQ $0x08, SI - MOVQ SI, R9 - SHLQ $0x08, R9 - IMULQ R8, R9 - SHRQ $0x30, R9 - MOVL 24(SP)(R9*4), BP + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x08, R10 + IMULQ R9, R10 + SHRQ $0x30, R10 + MOVL 24(SP)(R10*4), SI INCL CX - MOVL CX, 24(SP)(R9*4) - CMPL (DX)(BP*1), SI + MOVL CX, 24(SP)(R10*4) + CMPL (DX)(SI*1), DI JEQ candidate_match_encodeBetterBlockAsm4MB DECL CX - MOVL DI, BP + MOVL R8, SI candidate_match_encodeBetterBlockAsm4MB: - MOVL 12(SP), SI - TESTL BP, BP + MOVL 12(SP), DI + TESTL SI, SI JZ match_extend_back_end_encodeBetterBlockAsm4MB match_extend_back_loop_encodeBetterBlockAsm4MB: - CMPL CX, SI + CMPL CX, DI JLE match_extend_back_end_encodeBetterBlockAsm4MB - MOVB -1(DX)(BP*1), BL - MOVB -1(DX)(CX*1), DI - CMPB BL, DI + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 JNE match_extend_back_end_encodeBetterBlockAsm4MB LEAL -1(CX), CX - DECL BP + DECL SI JZ match_extend_back_end_encodeBetterBlockAsm4MB JMP match_extend_back_loop_encodeBetterBlockAsm4MB match_extend_back_end_encodeBetterBlockAsm4MB: - MOVL CX, SI - SUBL 12(SP), SI - LEAQ 4(AX)(SI*1), SI - CMPQ SI, (SP) + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 4(AX)(DI*1), DI + CMPQ DI, (SP) JL match_dst_size_check_encodeBetterBlockAsm4MB MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeBetterBlockAsm4MB: - MOVL CX, SI + MOVL CX, DI ADDL $0x04, CX - ADDL $0x04, BP - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(BP*1), R9 + ADDL $0x04, SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), R10 // matchLen - XORL R11, R11 - CMPL DI, $0x08 + XORL R12, R12 + CMPL R8, $0x08 JL matchlen_single_match_nolit_encodeBetterBlockAsm4MB matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB: - MOVQ (R8)(R11*1), R10 - XORQ (R9)(R11*1), R10 - TESTQ R10, R10 + MOVQ (R9)(R12*1), R11 + XORQ (R10)(R12*1), R11 + TESTQ R11, R11 JZ matchlen_loop_match_nolit_encodeBetterBlockAsm4MB - BSFQ R10, R10 - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 + BSFQ R11, R11 + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 JMP match_nolit_end_encodeBetterBlockAsm4MB matchlen_loop_match_nolit_encodeBetterBlockAsm4MB: - LEAL -8(DI), DI - LEAL 8(R11), R11 - CMPL DI, $0x08 + LEAL -8(R8), R8 + LEAL 8(R12), R12 + CMPL R8, $0x08 JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB matchlen_single_match_nolit_encodeBetterBlockAsm4MB: - TESTL DI, DI + TESTL R8, R8 JZ match_nolit_end_encodeBetterBlockAsm4MB matchlen_single_loopback_match_nolit_encodeBetterBlockAsm4MB: - MOVB (R8)(R11*1), R10 - CMPB (R9)(R11*1), R10 + MOVB (R9)(R12*1), R11 + CMPB (R10)(R12*1), R11 JNE match_nolit_end_encodeBetterBlockAsm4MB - LEAL 1(R11), R11 - DECL DI + LEAL 1(R12), R12 + DECL R8 JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm4MB match_nolit_end_encodeBetterBlockAsm4MB: - MOVL CX, DI - SUBL BP, DI + MOVL CX, R8 + SUBL SI, R8 // Check if repeat - CMPL 16(SP), DI + CMPL 16(SP), R8 JEQ match_is_repeat_encodeBetterBlockAsm4MB - CMPL R11, $0x01 + CMPL R12, $0x01 JG match_length_ok_encodeBetterBlockAsm4MB - CMPL DI, $0x0000ffff + CMPL R8, $0x0000ffff JLE match_length_ok_encodeBetterBlockAsm4MB MOVL 20(SP), CX INCL CX JMP search_loop_encodeBetterBlockAsm4MB match_length_ok_encodeBetterBlockAsm4MB: - MOVL DI, 16(SP) - MOVL 12(SP), BP - CMPL BP, SI + MOVL R8, 16(SP) + MOVL 12(SP), SI + CMPL SI, DI JEQ emit_literal_done_match_emit_encodeBetterBlockAsm4MB - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(BP*1), R9 - SUBL BP, R8 - LEAL -1(R8), BP - CMPL BP, $0x3c + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c JLT one_byte_match_emit_encodeBetterBlockAsm4MB - CMPL BP, $0x00000100 + CMPL SI, $0x00000100 JLT two_bytes_match_emit_encodeBetterBlockAsm4MB - CMPL BP, $0x00010000 + CMPL SI, $0x00010000 JLT three_bytes_match_emit_encodeBetterBlockAsm4MB - MOVL BP, R10 - SHRL $0x10, R10 + MOVL SI, R11 + SHRL $0x10, R11 MOVB $0xf8, (AX) - MOVW BP, 1(AX) - MOVB R10, 3(AX) + MOVW SI, 1(AX) + MOVB R11, 3(AX) ADDQ $0x04, AX JMP memmove_long_match_emit_encodeBetterBlockAsm4MB three_bytes_match_emit_encodeBetterBlockAsm4MB: MOVB $0xf4, (AX) - MOVW BP, 1(AX) + MOVW SI, 1(AX) ADDQ $0x03, AX JMP memmove_long_match_emit_encodeBetterBlockAsm4MB two_bytes_match_emit_encodeBetterBlockAsm4MB: MOVB $0xf0, (AX) - MOVB BP, 1(AX) + MOVB SI, 1(AX) ADDQ $0x02, AX - CMPL BP, $0x40 + CMPL SI, $0x40 JL memmove_match_emit_encodeBetterBlockAsm4MB JMP memmove_long_match_emit_encodeBetterBlockAsm4MB one_byte_match_emit_encodeBetterBlockAsm4MB: - SHLB $0x02, BP - MOVB BP, (AX) + SHLB $0x02, SI + MOVB SI, (AX) ADDQ $0x01, AX memmove_match_emit_encodeBetterBlockAsm4MB: - LEAQ (AX)(R8*1), BP + LEAQ (AX)(R9*1), SI // genMemMoveShort - CMPQ R8, $0x03 - JB emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_1or2 - JE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_3 - CMPQ R8, $0x08 + CMPQ R9, $0x04 + JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4 + CMPQ R9, $0x08 JB emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7 - CMPQ R8, $0x10 + CMPQ R9, $0x10 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16 - CMPQ R8, $0x20 + CMPQ R9, $0x20 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64 -emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_1or2: - MOVB (R9), R10 - MOVB -1(R9)(R8*1), R9 - MOVB R10, (AX) - MOVB R9, -1(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB - -emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_3: - MOVW (R9), R10 - MOVB 2(R9), R9 - MOVW R10, (AX) - MOVB R9, 2(AX) +emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4: + MOVL (R10), R11 + MOVL R11, (AX) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7: - MOVL (R9), R10 - MOVL -4(R9)(R8*1), R9 - MOVL R10, (AX) - MOVL R9, -4(AX)(R8*1) + MOVL (R10), R11 + MOVL -4(R10)(R9*1), R10 + MOVL R11, (AX) + MOVL R10, -4(AX)(R9*1) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) + MOVOU X1, -16(AX)(R9*1) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) memmove_end_copy_match_emit_encodeBetterBlockAsm4MB: - MOVQ BP, AX + MOVQ SI, AX JMP emit_literal_done_match_emit_encodeBetterBlockAsm4MB memmove_long_match_emit_encodeBetterBlockAsm4MB: - LEAQ (AX)(R8*1), BP + LEAQ (AX)(R9*1), SI // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R12 - SHRQ $0x05, R12 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R13 - SUBQ R10, R13 - DECQ R12 + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 - LEAQ -32(R9)(R13*1), R10 - LEAQ -32(AX)(R13*1), R14 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(AX)(R14*1), R15 emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 ADDQ $0x20, R14 - ADDQ $0x20, R10 - ADDQ $0x20, R13 - DECQ R12 + DECQ R13 JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32: - MOVOU -32(R9)(R13*1), X4 - MOVOU -16(R9)(R13*1), X5 - MOVOA X4, -32(AX)(R13*1) - MOVOA X5, -16(AX)(R13*1) - ADDQ $0x20, R13 - CMPQ R8, R13 + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ BP, AX + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX emit_literal_done_match_emit_encodeBetterBlockAsm4MB: - ADDL R11, CX - ADDL $0x04, R11 + ADDL R12, CX + ADDL $0x04, R12 MOVL CX, 12(SP) // emitCopy - CMPL DI, $0x00010000 + CMPL R8, $0x00010000 JL two_byte_offset_match_nolit_encodeBetterBlockAsm4MB four_bytes_loop_back_match_nolit_encodeBetterBlockAsm4MB: - CMPL R11, $0x40 + CMPL R12, $0x40 JLE four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB MOVB $0xff, (AX) - MOVL DI, 1(AX) - LEAL -64(R11), R11 + MOVL R8, 1(AX) + LEAL -64(R12), R12 ADDQ $0x05, AX - CMPL R11, $0x04 + CMPL R12, $0x04 JL four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB // emitRepeat - MOVL R11, BP - LEAL -4(R11), R11 - CMPL BP, $0x08 + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 JLE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy - CMPL BP, $0x0c + CMPL SI, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy - CMPL DI, $0x00000800 + CMPL R8, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy: - CMPL R11, $0x00000104 + CMPL R12, $0x00000104 JLT repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy - CMPL R11, $0x00010100 + CMPL R12, $0x00010100 JLT repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy - LEAL -65536(R11), R11 - MOVL R11, DI + LEAL -65536(R12), R12 + MOVL R12, R8 MOVW $0x001d, (AX) - MOVW R11, 2(AX) - SARL $0x10, DI - MOVB DI, 4(AX) + MOVW R12, 2(AX) + SARL $0x10, R8 + MOVB R8, 4(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy: - LEAL -256(R11), R11 + LEAL -256(R12), R12 MOVW $0x0019, (AX) - MOVW R11, 2(AX) + MOVW R12, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy: - LEAL -4(R11), R11 + LEAL -4(R12), R12 MOVW $0x0015, (AX) - MOVB R11, 2(AX) + MOVB R12, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy: - SHLL $0x02, R11 - ORL $0x01, R11 - MOVW R11, (AX) + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy: - XORQ BP, BP - LEAL 1(BP)(R11*4), R11 - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, R11 - MOVB R11, (AX) + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB JMP four_bytes_loop_back_match_nolit_encodeBetterBlockAsm4MB four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB: - TESTL R11, R11 + TESTL R12, R12 JZ match_nolit_emitcopy_end_encodeBetterBlockAsm4MB MOVB $0x03, BL - LEAL -4(BX)(R11*4), R11 - MOVB R11, (AX) - MOVL DI, 1(AX) + LEAL -4(BX)(R12*4), R12 + MOVB R12, (AX) + MOVL R8, 1(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB two_byte_offset_match_nolit_encodeBetterBlockAsm4MB: - CMPL R11, $0x40 + CMPL R12, $0x40 JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB MOVB $0xee, (AX) - MOVW DI, 1(AX) - LEAL -60(R11), R11 + MOVW R8, 1(AX) + LEAL -60(R12), R12 ADDQ $0x03, AX // emitRepeat - MOVL R11, BP - LEAL -4(R11), R11 - CMPL BP, $0x08 + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 JLE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short - CMPL BP, $0x0c + CMPL SI, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short - CMPL DI, $0x00000800 + CMPL R8, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: - CMPL R11, $0x00000104 + CMPL R12, $0x00000104 JLT repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short - CMPL R11, $0x00010100 + CMPL R12, $0x00010100 JLT repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short - LEAL -65536(R11), R11 - MOVL R11, DI + LEAL -65536(R12), R12 + MOVL R12, R8 MOVW $0x001d, (AX) - MOVW R11, 2(AX) - SARL $0x10, DI - MOVB DI, 4(AX) + MOVW R12, 2(AX) + SARL $0x10, R8 + MOVB R8, 4(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: - LEAL -256(R11), R11 + LEAL -256(R12), R12 MOVW $0x0019, (AX) - MOVW R11, 2(AX) + MOVW R12, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: - LEAL -4(R11), R11 + LEAL -4(R12), R12 MOVW $0x0015, (AX) - MOVB R11, 2(AX) + MOVB R12, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: - SHLL $0x02, R11 - ORL $0x01, R11 - MOVW R11, (AX) + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: - XORQ BP, BP - LEAL 1(BP)(R11*4), R11 - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, R11 - MOVB R11, (AX) + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB JMP two_byte_offset_match_nolit_encodeBetterBlockAsm4MB two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB: - CMPL R11, $0x0c + CMPL R12, $0x0c JGE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB - CMPL DI, $0x00000800 + CMPL R8, $0x00000800 JGE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB MOVB $0x01, BL - LEAL -16(BX)(R11*4), R11 - MOVB DI, 1(AX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, R11 - MOVB R11, (AX) + LEAL -16(BX)(R12*4), R12 + MOVB R8, 1(AX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB emit_copy_three_match_nolit_encodeBetterBlockAsm4MB: MOVB $0x02, BL - LEAL -4(BX)(R11*4), R11 - MOVB R11, (AX) - MOVW DI, 1(AX) + LEAL -4(BX)(R12*4), R12 + MOVB R12, (AX) + MOVW R8, 1(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB match_is_repeat_encodeBetterBlockAsm4MB: - MOVL 12(SP), BP - CMPL BP, SI + MOVL 12(SP), SI + CMPL SI, DI JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(BP*1), R9 - SUBL BP, R8 - LEAL -1(R8), BP - CMPL BP, $0x3c + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c JLT one_byte_match_emit_repeat_encodeBetterBlockAsm4MB - CMPL BP, $0x00000100 + CMPL SI, $0x00000100 JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB - CMPL BP, $0x00010000 + CMPL SI, $0x00010000 JLT three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB - MOVL BP, R10 - SHRL $0x10, R10 + MOVL SI, R11 + SHRL $0x10, R11 MOVB $0xf8, (AX) - MOVW BP, 1(AX) - MOVB R10, 3(AX) + MOVW SI, 1(AX) + MOVB R11, 3(AX) ADDQ $0x04, AX JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB: MOVB $0xf4, (AX) - MOVW BP, 1(AX) + MOVW SI, 1(AX) ADDQ $0x03, AX JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB: MOVB $0xf0, (AX) - MOVB BP, 1(AX) + MOVB SI, 1(AX) ADDQ $0x02, AX - CMPL BP, $0x40 + CMPL SI, $0x40 JL memmove_match_emit_repeat_encodeBetterBlockAsm4MB JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB one_byte_match_emit_repeat_encodeBetterBlockAsm4MB: - SHLB $0x02, BP - MOVB BP, (AX) + SHLB $0x02, SI + MOVB SI, (AX) ADDQ $0x01, AX memmove_match_emit_repeat_encodeBetterBlockAsm4MB: - LEAQ (AX)(R8*1), BP + LEAQ (AX)(R9*1), SI // genMemMoveShort - CMPQ R8, $0x03 - JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_1or2 - JE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_3 - CMPQ R8, $0x08 + CMPQ R9, $0x04 + JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4 + CMPQ R9, $0x08 JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7 - CMPQ R8, $0x10 + CMPQ R9, $0x10 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16 - CMPQ R8, $0x20 + CMPQ R9, $0x20 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32 JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64 -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_1or2: - MOVB (R9), R10 - MOVB -1(R9)(R8*1), R9 - MOVB R10, (AX) - MOVB R9, -1(AX)(R8*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_3: - MOVW (R9), R10 - MOVB 2(R9), R9 - MOVW R10, (AX) - MOVB R9, 2(AX) +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4: + MOVL (R10), R11 + MOVL R11, (AX) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7: - MOVL (R9), R10 - MOVL -4(R9)(R8*1), R9 - MOVL R10, (AX) - MOVL R9, -4(AX)(R8*1) + MOVL (R10), R11 + MOVL -4(R10)(R9*1), R10 + MOVL R11, (AX) + MOVL R10, -4(AX)(R9*1) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) + MOVOU X1, -16(AX)(R9*1) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB: - MOVQ BP, AX + MOVQ SI, AX JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB: - LEAQ (AX)(R8*1), BP + LEAQ (AX)(R9*1), SI // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R12 - SHRQ $0x05, R12 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R13 - SUBQ R10, R13 - DECQ R12 + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 - LEAQ -32(R9)(R13*1), R10 - LEAQ -32(AX)(R13*1), R14 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(AX)(R14*1), R15 emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 ADDQ $0x20, R14 - ADDQ $0x20, R10 - ADDQ $0x20, R13 - DECQ R12 + DECQ R13 JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32: - MOVOU -32(R9)(R13*1), X4 - MOVOU -16(R9)(R13*1), X5 - MOVOA X4, -32(AX)(R13*1) - MOVOA X5, -16(AX)(R13*1) - ADDQ $0x20, R13 - CMPQ R8, R13 + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ BP, AX + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB: - ADDL R11, CX - ADDL $0x04, R11 + ADDL R12, CX + ADDL $0x04, R12 MOVL CX, 12(SP) // emitRepeat - MOVL R11, BP - LEAL -4(R11), R11 - CMPL BP, $0x08 + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB - CMPL BP, $0x0c + CMPL SI, $0x0c JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB - CMPL DI, $0x00000800 + CMPL R8, $0x00000800 JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB: - CMPL R11, $0x00000104 + CMPL R12, $0x00000104 JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB - CMPL R11, $0x00010100 + CMPL R12, $0x00010100 JLT repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB - LEAL -65536(R11), R11 - MOVL R11, DI + LEAL -65536(R12), R12 + MOVL R12, R8 MOVW $0x001d, (AX) - MOVW R11, 2(AX) - SARL $0x10, DI - MOVB DI, 4(AX) + MOVW R12, 2(AX) + SARL $0x10, R8 + MOVB R8, 4(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB: - LEAL -256(R11), R11 + LEAL -256(R12), R12 MOVW $0x0019, (AX) - MOVW R11, 2(AX) + MOVW R12, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB: - LEAL -4(R11), R11 + LEAL -4(R12), R12 MOVW $0x0015, (AX) - MOVB R11, 2(AX) + MOVB R12, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB: - SHLL $0x02, R11 - ORL $0x01, R11 - MOVW R11, (AX) + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB: - XORQ BP, BP - LEAL 1(BP)(R11*4), R11 - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, R11 - MOVB R11, (AX) + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) ADDQ $0x02, AX match_nolit_emitcopy_end_encodeBetterBlockAsm4MB: @@ -6712,36 +6410,53 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm4MB: RET match_nolit_dst_ok_encodeBetterBlockAsm4MB: - MOVQ $0x00cf1bbcdcbfa563, BP - MOVQ $0x9e3779b1, DI - INCL SI - MOVQ (DX)(SI*1), R8 - MOVQ R8, R9 - MOVQ R8, R10 - SHRQ $0x08, R10 - LEAL 1(SI), R11 - MOVQ -2(DX)(CX*1), R8 - SHLQ $0x08, R9 - IMULQ BP, R9 - SHRQ $0x30, R9 - SHLQ $0x20, R10 - IMULQ DI, R10 - SHRQ $0x32, R10 - MOVL SI, 24(SP)(R9*4) - MOVL R11, 262168(SP)(R10*4) - MOVQ R8, R9 - MOVQ R8, R10 - SHRQ $0x08, R10 - LEAL -2(CX), R8 - LEAL -1(CX), SI - SHLQ $0x08, R9 - IMULQ BP, R9 - SHRQ $0x30, R9 - SHLQ $0x20, R10 - IMULQ DI, R10 - SHRQ $0x32, R10 - MOVL R8, 24(SP)(R9*4) - MOVL SI, 262168(SP)(R10*4) + MOVQ $0x00cf1bbcdcbfa563, SI + MOVQ $0x9e3779b1, R8 + INCL DI + MOVQ (DX)(DI*1), R9 + MOVQ R9, R10 + MOVQ R9, R11 + MOVQ R9, R12 + SHRQ $0x08, R11 + MOVQ R11, R13 + SHRQ $0x10, R12 + LEAL 1(DI), R14 + LEAL 2(DI), R15 + MOVQ -2(DX)(CX*1), R9 + SHLQ $0x08, R10 + IMULQ SI, R10 + SHRQ $0x30, R10 + SHLQ $0x08, R13 + IMULQ SI, R13 + SHRQ $0x30, R13 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x32, R11 + SHLQ $0x20, R12 + IMULQ R8, R12 + SHRQ $0x32, R12 + MOVL DI, 24(SP)(R10*4) + MOVL R14, 24(SP)(R13*4) + MOVL R14, 262168(SP)(R11*4) + MOVL R15, 262168(SP)(R12*4) + MOVQ R9, R10 + MOVQ R9, R11 + SHRQ $0x08, R11 + MOVQ R11, R13 + LEAL -2(CX), R9 + LEAL -1(CX), DI + SHLQ $0x08, R10 + IMULQ SI, R10 + SHRQ $0x30, R10 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x32, R11 + SHLQ $0x08, R13 + IMULQ SI, R13 + SHRQ $0x30, R13 + MOVL R9, 24(SP)(R10*4) + MOVL DI, 262168(SP)(R11*4) + MOVL DI, 24(SP)(R13*4) JMP search_loop_encodeBetterBlockAsm4MB emit_remainder_encodeBetterBlockAsm4MB: @@ -6758,11 +6473,11 @@ emit_remainder_ok_encodeBetterBlockAsm4MB: MOVL 12(SP), BX CMPL BX, CX JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB - MOVL CX, BP + MOVL CX, SI MOVL CX, 12(SP) LEAQ (DX)(BX*1), CX - SUBL BX, BP - LEAL -1(BP), DX + SUBL BX, SI + LEAL -1(SI), DX CMPL DX, $0x3c JLT one_byte_emit_remainder_encodeBetterBlockAsm4MB CMPL DX, $0x00000100 @@ -6797,13 +6512,12 @@ one_byte_emit_remainder_encodeBetterBlockAsm4MB: ADDQ $0x01, AX memmove_emit_remainder_encodeBetterBlockAsm4MB: - LEAQ (AX)(BP*1), DX - MOVL BP, BX + LEAQ (AX)(SI*1), DX + MOVL SI, BX // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3 + CMPQ BX, $0x04 + JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7 CMPQ BX, $0x10 @@ -6812,31 +6526,22 @@ memmove_emit_remainder_encodeBetterBlockAsm4MB: JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2: - MOVB (CX), BP - MOVB -1(CX)(BX*1), CL - MOVB BP, (AX) - MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3: - MOVW (CX), BP - MOVB 2(CX), CL - MOVW BP, (AX) - MOVB CL, 2(AX) +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4: + MOVL (CX), SI + MOVL SI, (AX) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7: - MOVL (CX), BP + MOVL (CX), SI MOVL -4(CX)(BX*1), CX - MOVL BP, (AX) + MOVL SI, (AX) MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16: - MOVQ (CX), BP + MOVQ (CX), SI MOVQ -8(CX)(BX*1), CX - MOVQ BP, (AX) + MOVQ SI, (AX) MOVQ CX, -8(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB @@ -6862,43 +6567,43 @@ memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB: JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB memmove_long_emit_remainder_encodeBetterBlockAsm4MB: - LEAQ (AX)(BP*1), DX - MOVL BP, BX + LEAQ (AX)(SI*1), DX + MOVL SI, BX // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 - MOVQ BX, SI - SHRQ $0x05, SI - MOVQ AX, BP - ANDL $0x0000001f, BP - MOVQ $0x00000040, DI - SUBQ BP, DI - DECQ SI + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 - LEAQ -32(CX)(DI*1), BP - LEAQ -32(AX)(DI*1), R8 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back: - MOVOU (BP), X4 - MOVOU 16(BP), X5 - MOVOA X4, (R8) - MOVOA X5, 16(R8) + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI ADDQ $0x20, R8 - ADDQ $0x20, BP - ADDQ $0x20, DI - DECQ SI + DECQ DI JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32: - MOVOU -32(CX)(DI*1), X4 - MOVOU -16(CX)(DI*1), X5 - MOVOA X4, -32(AX)(DI*1) - MOVOA X5, -16(AX)(DI*1) - ADDQ $0x20, DI - CMPQ BX, DI + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) @@ -6935,8 +6640,8 @@ zero_loop_encodeBetterBlockAsm12B: MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -6(CX), DX - LEAQ -8(CX), BP - MOVL BP, 8(SP) + LEAQ -8(CX), SI + MOVL SI, 8(SP) SHRQ $0x05, CX SUBL CX, DX LEAQ (AX)(DX*1), DX @@ -6946,530 +6651,510 @@ zero_loop_encodeBetterBlockAsm12B: MOVQ src_base+24(FP), DX search_loop_encodeBetterBlockAsm12B: - MOVQ (DX)(CX*1), SI - MOVL CX, BP - SUBL 12(SP), BP - SHRL $0x06, BP - LEAL 1(CX)(BP*1), BP - CMPL BP, 8(SP) + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x06, SI + LEAL 1(CX)(SI*1), SI + CMPL SI, 8(SP) JGE emit_remainder_encodeBetterBlockAsm12B - MOVL BP, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R8 - MOVQ $0x9e3779b1, BP - MOVQ SI, R9 - MOVQ SI, R10 - SHLQ $0x10, R9 - IMULQ R8, R9 - SHRQ $0x32, R9 - SHLQ $0x20, R10 - IMULQ BP, R10 - SHRQ $0x34, R10 - MOVL 24(SP)(R9*4), BP - MOVL 65560(SP)(R10*4), DI - MOVL CX, 24(SP)(R9*4) - MOVL CX, 65560(SP)(R10*4) - CMPL (DX)(BP*1), SI + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x34, R11 + MOVL 24(SP)(R10*4), SI + MOVL 65560(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + MOVL CX, 65560(SP)(R11*4) + CMPL (DX)(SI*1), DI JEQ candidate_match_encodeBetterBlockAsm12B - CMPL (DX)(DI*1), SI + CMPL (DX)(R8*1), DI JEQ candidateS_match_encodeBetterBlockAsm12B MOVL 20(SP), CX JMP search_loop_encodeBetterBlockAsm12B candidateS_match_encodeBetterBlockAsm12B: - SHRQ $0x08, SI - MOVQ SI, R9 - SHLQ $0x10, R9 - IMULQ R8, R9 - SHRQ $0x32, R9 - MOVL 24(SP)(R9*4), BP + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + MOVL 24(SP)(R10*4), SI INCL CX - MOVL CX, 24(SP)(R9*4) - CMPL (DX)(BP*1), SI + MOVL CX, 24(SP)(R10*4) + CMPL (DX)(SI*1), DI JEQ candidate_match_encodeBetterBlockAsm12B DECL CX - MOVL DI, BP + MOVL R8, SI candidate_match_encodeBetterBlockAsm12B: - MOVL 12(SP), SI - TESTL BP, BP + MOVL 12(SP), DI + TESTL SI, SI JZ match_extend_back_end_encodeBetterBlockAsm12B match_extend_back_loop_encodeBetterBlockAsm12B: - CMPL CX, SI + CMPL CX, DI JLE match_extend_back_end_encodeBetterBlockAsm12B - MOVB -1(DX)(BP*1), BL - MOVB -1(DX)(CX*1), DI - CMPB BL, DI + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 JNE match_extend_back_end_encodeBetterBlockAsm12B LEAL -1(CX), CX - DECL BP + DECL SI JZ match_extend_back_end_encodeBetterBlockAsm12B JMP match_extend_back_loop_encodeBetterBlockAsm12B match_extend_back_end_encodeBetterBlockAsm12B: - MOVL CX, SI - SUBL 12(SP), SI - LEAQ 3(AX)(SI*1), SI - CMPQ SI, (SP) + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 3(AX)(DI*1), DI + CMPQ DI, (SP) JL match_dst_size_check_encodeBetterBlockAsm12B MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeBetterBlockAsm12B: - MOVL CX, SI + MOVL CX, DI ADDL $0x04, CX - ADDL $0x04, BP - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(BP*1), R9 + ADDL $0x04, SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), R10 // matchLen - XORL R11, R11 - CMPL DI, $0x08 + XORL R12, R12 + CMPL R8, $0x08 JL matchlen_single_match_nolit_encodeBetterBlockAsm12B matchlen_loopback_match_nolit_encodeBetterBlockAsm12B: - MOVQ (R8)(R11*1), R10 - XORQ (R9)(R11*1), R10 - TESTQ R10, R10 + MOVQ (R9)(R12*1), R11 + XORQ (R10)(R12*1), R11 + TESTQ R11, R11 JZ matchlen_loop_match_nolit_encodeBetterBlockAsm12B - BSFQ R10, R10 - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 + BSFQ R11, R11 + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 JMP match_nolit_end_encodeBetterBlockAsm12B matchlen_loop_match_nolit_encodeBetterBlockAsm12B: - LEAL -8(DI), DI - LEAL 8(R11), R11 - CMPL DI, $0x08 + LEAL -8(R8), R8 + LEAL 8(R12), R12 + CMPL R8, $0x08 JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm12B matchlen_single_match_nolit_encodeBetterBlockAsm12B: - TESTL DI, DI + TESTL R8, R8 JZ match_nolit_end_encodeBetterBlockAsm12B matchlen_single_loopback_match_nolit_encodeBetterBlockAsm12B: - MOVB (R8)(R11*1), R10 - CMPB (R9)(R11*1), R10 + MOVB (R9)(R12*1), R11 + CMPB (R10)(R12*1), R11 JNE match_nolit_end_encodeBetterBlockAsm12B - LEAL 1(R11), R11 - DECL DI + LEAL 1(R12), R12 + DECL R8 JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm12B match_nolit_end_encodeBetterBlockAsm12B: - MOVL CX, DI - SUBL BP, DI + MOVL CX, R8 + SUBL SI, R8 // Check if repeat - CMPL 16(SP), DI + CMPL 16(SP), R8 JEQ match_is_repeat_encodeBetterBlockAsm12B - MOVL DI, 16(SP) - MOVL 12(SP), BP - CMPL BP, SI + MOVL R8, 16(SP) + MOVL 12(SP), SI + CMPL SI, DI JEQ emit_literal_done_match_emit_encodeBetterBlockAsm12B - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(BP*1), R9 - SUBL BP, R8 - LEAL -1(R8), BP - CMPL BP, $0x3c + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c JLT one_byte_match_emit_encodeBetterBlockAsm12B - CMPL BP, $0x00000100 + CMPL SI, $0x00000100 JLT two_bytes_match_emit_encodeBetterBlockAsm12B MOVB $0xf4, (AX) - MOVW BP, 1(AX) + MOVW SI, 1(AX) ADDQ $0x03, AX JMP memmove_long_match_emit_encodeBetterBlockAsm12B two_bytes_match_emit_encodeBetterBlockAsm12B: MOVB $0xf0, (AX) - MOVB BP, 1(AX) + MOVB SI, 1(AX) ADDQ $0x02, AX - CMPL BP, $0x40 + CMPL SI, $0x40 JL memmove_match_emit_encodeBetterBlockAsm12B JMP memmove_long_match_emit_encodeBetterBlockAsm12B one_byte_match_emit_encodeBetterBlockAsm12B: - SHLB $0x02, BP - MOVB BP, (AX) + SHLB $0x02, SI + MOVB SI, (AX) ADDQ $0x01, AX memmove_match_emit_encodeBetterBlockAsm12B: - LEAQ (AX)(R8*1), BP + LEAQ (AX)(R9*1), SI // genMemMoveShort - CMPQ R8, $0x03 - JB emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_1or2 - JE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_3 - CMPQ R8, $0x08 + CMPQ R9, $0x04 + JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4 + CMPQ R9, $0x08 JB emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7 - CMPQ R8, $0x10 + CMPQ R9, $0x10 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16 - CMPQ R8, $0x20 + CMPQ R9, $0x20 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64 -emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_1or2: - MOVB (R9), R10 - MOVB -1(R9)(R8*1), R9 - MOVB R10, (AX) - MOVB R9, -1(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_3: - MOVW (R9), R10 - MOVB 2(R9), R9 - MOVW R10, (AX) - MOVB R9, 2(AX) +emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4: + MOVL (R10), R11 + MOVL R11, (AX) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7: - MOVL (R9), R10 - MOVL -4(R9)(R8*1), R9 - MOVL R10, (AX) - MOVL R9, -4(AX)(R8*1) + MOVL (R10), R11 + MOVL -4(R10)(R9*1), R10 + MOVL R11, (AX) + MOVL R10, -4(AX)(R9*1) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) + MOVOU X1, -16(AX)(R9*1) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) memmove_end_copy_match_emit_encodeBetterBlockAsm12B: - MOVQ BP, AX + MOVQ SI, AX JMP emit_literal_done_match_emit_encodeBetterBlockAsm12B memmove_long_match_emit_encodeBetterBlockAsm12B: - LEAQ (AX)(R8*1), BP + LEAQ (AX)(R9*1), SI // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R12 - SHRQ $0x05, R12 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R13 - SUBQ R10, R13 - DECQ R12 + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(R9)(R13*1), R10 - LEAQ -32(AX)(R13*1), R14 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(AX)(R14*1), R15 emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 ADDQ $0x20, R14 - ADDQ $0x20, R10 - ADDQ $0x20, R13 - DECQ R12 + DECQ R13 JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(R9)(R13*1), X4 - MOVOU -16(R9)(R13*1), X5 - MOVOA X4, -32(AX)(R13*1) - MOVOA X5, -16(AX)(R13*1) - ADDQ $0x20, R13 - CMPQ R8, R13 + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ BP, AX + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX emit_literal_done_match_emit_encodeBetterBlockAsm12B: - ADDL R11, CX - ADDL $0x04, R11 + ADDL R12, CX + ADDL $0x04, R12 MOVL CX, 12(SP) // emitCopy two_byte_offset_match_nolit_encodeBetterBlockAsm12B: - CMPL R11, $0x40 + CMPL R12, $0x40 JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B MOVB $0xee, (AX) - MOVW DI, 1(AX) - LEAL -60(R11), R11 + MOVW R8, 1(AX) + LEAL -60(R12), R12 ADDQ $0x03, AX // emitRepeat - MOVL R11, BP - LEAL -4(R11), R11 - CMPL BP, $0x08 + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 JLE repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short - CMPL BP, $0x0c + CMPL SI, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short - CMPL DI, $0x00000800 + CMPL R8, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: - CMPL R11, $0x00000104 + CMPL R12, $0x00000104 JLT repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short - LEAL -256(R11), R11 + LEAL -256(R12), R12 MOVW $0x0019, (AX) - MOVW R11, 2(AX) + MOVW R12, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: - LEAL -4(R11), R11 + LEAL -4(R12), R12 MOVW $0x0015, (AX) - MOVB R11, 2(AX) + MOVB R12, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: - SHLL $0x02, R11 - ORL $0x01, R11 - MOVW R11, (AX) + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: - XORQ BP, BP - LEAL 1(BP)(R11*4), R11 - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, R11 - MOVB R11, (AX) + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B JMP two_byte_offset_match_nolit_encodeBetterBlockAsm12B two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B: - CMPL R11, $0x0c + CMPL R12, $0x0c JGE emit_copy_three_match_nolit_encodeBetterBlockAsm12B - CMPL DI, $0x00000800 + CMPL R8, $0x00000800 JGE emit_copy_three_match_nolit_encodeBetterBlockAsm12B MOVB $0x01, BL - LEAL -16(BX)(R11*4), R11 - MOVB DI, 1(AX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, R11 - MOVB R11, (AX) + LEAL -16(BX)(R12*4), R12 + MOVB R8, 1(AX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B emit_copy_three_match_nolit_encodeBetterBlockAsm12B: MOVB $0x02, BL - LEAL -4(BX)(R11*4), R11 - MOVB R11, (AX) - MOVW DI, 1(AX) + LEAL -4(BX)(R12*4), R12 + MOVB R12, (AX) + MOVW R8, 1(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B match_is_repeat_encodeBetterBlockAsm12B: - MOVL 12(SP), BP - CMPL BP, SI + MOVL 12(SP), SI + CMPL SI, DI JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(BP*1), R9 - SUBL BP, R8 - LEAL -1(R8), BP - CMPL BP, $0x3c + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c JLT one_byte_match_emit_repeat_encodeBetterBlockAsm12B - CMPL BP, $0x00000100 + CMPL SI, $0x00000100 JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm12B MOVB $0xf4, (AX) - MOVW BP, 1(AX) + MOVW SI, 1(AX) ADDQ $0x03, AX JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm12B two_bytes_match_emit_repeat_encodeBetterBlockAsm12B: MOVB $0xf0, (AX) - MOVB BP, 1(AX) + MOVB SI, 1(AX) ADDQ $0x02, AX - CMPL BP, $0x40 + CMPL SI, $0x40 JL memmove_match_emit_repeat_encodeBetterBlockAsm12B JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm12B one_byte_match_emit_repeat_encodeBetterBlockAsm12B: - SHLB $0x02, BP - MOVB BP, (AX) + SHLB $0x02, SI + MOVB SI, (AX) ADDQ $0x01, AX memmove_match_emit_repeat_encodeBetterBlockAsm12B: - LEAQ (AX)(R8*1), BP + LEAQ (AX)(R9*1), SI // genMemMoveShort - CMPQ R8, $0x03 - JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_1or2 - JE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_3 - CMPQ R8, $0x08 + CMPQ R9, $0x04 + JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4 + CMPQ R9, $0x08 JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7 - CMPQ R8, $0x10 + CMPQ R9, $0x10 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16 - CMPQ R8, $0x20 + CMPQ R9, $0x20 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32 JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64 -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_1or2: - MOVB (R9), R10 - MOVB -1(R9)(R8*1), R9 - MOVB R10, (AX) - MOVB R9, -1(AX)(R8*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_3: - MOVW (R9), R10 - MOVB 2(R9), R9 - MOVW R10, (AX) - MOVB R9, 2(AX) +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4: + MOVL (R10), R11 + MOVL R11, (AX) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7: - MOVL (R9), R10 - MOVL -4(R9)(R8*1), R9 - MOVL R10, (AX) - MOVL R9, -4(AX)(R8*1) + MOVL (R10), R11 + MOVL -4(R10)(R9*1), R10 + MOVL R11, (AX) + MOVL R10, -4(AX)(R9*1) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) + MOVOU X1, -16(AX)(R9*1) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B: - MOVQ BP, AX + MOVQ SI, AX JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B memmove_long_match_emit_repeat_encodeBetterBlockAsm12B: - LEAQ (AX)(R8*1), BP + LEAQ (AX)(R9*1), SI // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R12 - SHRQ $0x05, R12 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R13 - SUBQ R10, R13 - DECQ R12 + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(R9)(R13*1), R10 - LEAQ -32(AX)(R13*1), R14 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(AX)(R14*1), R15 emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 ADDQ $0x20, R14 - ADDQ $0x20, R10 - ADDQ $0x20, R13 - DECQ R12 + DECQ R13 JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(R9)(R13*1), X4 - MOVOU -16(R9)(R13*1), X5 - MOVOA X4, -32(AX)(R13*1) - MOVOA X5, -16(AX)(R13*1) - ADDQ $0x20, R13 - CMPQ R8, R13 + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ BP, AX + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B: - ADDL R11, CX - ADDL $0x04, R11 + ADDL R12, CX + ADDL $0x04, R12 MOVL CX, 12(SP) // emitRepeat - MOVL R11, BP - LEAL -4(R11), R11 - CMPL BP, $0x08 + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B - CMPL BP, $0x0c + CMPL SI, $0x0c JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B - CMPL DI, $0x00000800 + CMPL R8, $0x00000800 JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B: - CMPL R11, $0x00000104 + CMPL R12, $0x00000104 JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B - LEAL -256(R11), R11 + LEAL -256(R12), R12 MOVW $0x0019, (AX) - MOVW R11, 2(AX) + MOVW R12, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B: - LEAL -4(R11), R11 + LEAL -4(R12), R12 MOVW $0x0015, (AX) - MOVB R11, 2(AX) + MOVB R12, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B: - SHLL $0x02, R11 - ORL $0x01, R11 - MOVW R11, (AX) + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B: - XORQ BP, BP - LEAL 1(BP)(R11*4), R11 - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, R11 - MOVB R11, (AX) + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) ADDQ $0x02, AX match_nolit_emitcopy_end_encodeBetterBlockAsm12B: @@ -7481,36 +7166,53 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm12B: RET match_nolit_dst_ok_encodeBetterBlockAsm12B: - MOVQ $0x0000cf1bbcdcbf9b, BP - MOVQ $0x9e3779b1, DI - INCL SI - MOVQ (DX)(SI*1), R8 - MOVQ R8, R9 - MOVQ R8, R10 - SHRQ $0x08, R10 - LEAL 1(SI), R11 - MOVQ -2(DX)(CX*1), R8 - SHLQ $0x10, R9 - IMULQ BP, R9 - SHRQ $0x32, R9 - SHLQ $0x20, R10 - IMULQ DI, R10 - SHRQ $0x34, R10 - MOVL SI, 24(SP)(R9*4) - MOVL R11, 65560(SP)(R10*4) - MOVQ R8, R9 - MOVQ R8, R10 - SHRQ $0x08, R10 - LEAL -2(CX), R8 - LEAL -1(CX), SI - SHLQ $0x10, R9 - IMULQ BP, R9 - SHRQ $0x32, R9 - SHLQ $0x20, R10 - IMULQ DI, R10 - SHRQ $0x34, R10 - MOVL R8, 24(SP)(R9*4) - MOVL SI, 65560(SP)(R10*4) + MOVQ $0x0000cf1bbcdcbf9b, SI + MOVQ $0x9e3779b1, R8 + INCL DI + MOVQ (DX)(DI*1), R9 + MOVQ R9, R10 + MOVQ R9, R11 + MOVQ R9, R12 + SHRQ $0x08, R11 + MOVQ R11, R13 + SHRQ $0x10, R12 + LEAL 1(DI), R14 + LEAL 2(DI), R15 + MOVQ -2(DX)(CX*1), R9 + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x32, R10 + SHLQ $0x10, R13 + IMULQ SI, R13 + SHRQ $0x32, R13 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x34, R11 + SHLQ $0x20, R12 + IMULQ R8, R12 + SHRQ $0x34, R12 + MOVL DI, 24(SP)(R10*4) + MOVL R14, 24(SP)(R13*4) + MOVL R14, 65560(SP)(R11*4) + MOVL R15, 65560(SP)(R12*4) + MOVQ R9, R10 + MOVQ R9, R11 + SHRQ $0x08, R11 + MOVQ R11, R13 + LEAL -2(CX), R9 + LEAL -1(CX), DI + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x32, R10 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x34, R11 + SHLQ $0x10, R13 + IMULQ SI, R13 + SHRQ $0x32, R13 + MOVL R9, 24(SP)(R10*4) + MOVL DI, 65560(SP)(R11*4) + MOVL DI, 24(SP)(R13*4) JMP search_loop_encodeBetterBlockAsm12B emit_remainder_encodeBetterBlockAsm12B: @@ -7527,11 +7229,11 @@ emit_remainder_ok_encodeBetterBlockAsm12B: MOVL 12(SP), BX CMPL BX, CX JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm12B - MOVL CX, BP + MOVL CX, SI MOVL CX, 12(SP) LEAQ (DX)(BX*1), CX - SUBL BX, BP - LEAL -1(BP), DX + SUBL BX, SI + LEAL -1(SI), DX CMPL DX, $0x3c JLT one_byte_emit_remainder_encodeBetterBlockAsm12B CMPL DX, $0x00000100 @@ -7555,13 +7257,12 @@ one_byte_emit_remainder_encodeBetterBlockAsm12B: ADDQ $0x01, AX memmove_emit_remainder_encodeBetterBlockAsm12B: - LEAQ (AX)(BP*1), DX - MOVL BP, BX + LEAQ (AX)(SI*1), DX + MOVL SI, BX // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3 + CMPQ BX, $0x04 + JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7 CMPQ BX, $0x10 @@ -7570,31 +7271,22 @@ memmove_emit_remainder_encodeBetterBlockAsm12B: JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2: - MOVB (CX), BP - MOVB -1(CX)(BX*1), CL - MOVB BP, (AX) - MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3: - MOVW (CX), BP - MOVB 2(CX), CL - MOVW BP, (AX) - MOVB CL, 2(AX) +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4: + MOVL (CX), SI + MOVL SI, (AX) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7: - MOVL (CX), BP + MOVL (CX), SI MOVL -4(CX)(BX*1), CX - MOVL BP, (AX) + MOVL SI, (AX) MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16: - MOVQ (CX), BP + MOVQ (CX), SI MOVQ -8(CX)(BX*1), CX - MOVQ BP, (AX) + MOVQ SI, (AX) MOVQ CX, -8(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B @@ -7620,43 +7312,43 @@ memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B: JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm12B memmove_long_emit_remainder_encodeBetterBlockAsm12B: - LEAQ (AX)(BP*1), DX - MOVL BP, BX + LEAQ (AX)(SI*1), DX + MOVL SI, BX // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 - MOVQ BX, SI - SHRQ $0x05, SI - MOVQ AX, BP - ANDL $0x0000001f, BP - MOVQ $0x00000040, DI - SUBQ BP, DI - DECQ SI + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(CX)(DI*1), BP - LEAQ -32(AX)(DI*1), R8 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back: - MOVOU (BP), X4 - MOVOU 16(BP), X5 - MOVOA X4, (R8) - MOVOA X5, 16(R8) + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI ADDQ $0x20, R8 - ADDQ $0x20, BP - ADDQ $0x20, DI - DECQ SI + DECQ DI JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(CX)(DI*1), X4 - MOVOU -16(CX)(DI*1), X5 - MOVOA X4, -32(AX)(DI*1) - MOVOA X5, -16(AX)(DI*1) - ADDQ $0x20, DI - CMPQ BX, DI + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) @@ -7693,8 +7385,8 @@ zero_loop_encodeBetterBlockAsm10B: MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -6(CX), DX - LEAQ -8(CX), BP - MOVL BP, 8(SP) + LEAQ -8(CX), SI + MOVL SI, 8(SP) SHRQ $0x05, CX SUBL CX, DX LEAQ (AX)(DX*1), DX @@ -7704,530 +7396,510 @@ zero_loop_encodeBetterBlockAsm10B: MOVQ src_base+24(FP), DX search_loop_encodeBetterBlockAsm10B: - MOVQ (DX)(CX*1), SI - MOVL CX, BP - SUBL 12(SP), BP - SHRL $0x05, BP - LEAL 1(CX)(BP*1), BP - CMPL BP, 8(SP) + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x05, SI + LEAL 1(CX)(SI*1), SI + CMPL SI, 8(SP) JGE emit_remainder_encodeBetterBlockAsm10B - MOVL BP, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R8 - MOVQ $0x9e3779b1, BP - MOVQ SI, R9 - MOVQ SI, R10 - SHLQ $0x10, R9 - IMULQ R8, R9 - SHRQ $0x34, R9 - SHLQ $0x20, R10 - IMULQ BP, R10 - SHRQ $0x36, R10 - MOVL 24(SP)(R9*4), BP - MOVL 16408(SP)(R10*4), DI - MOVL CX, 24(SP)(R9*4) - MOVL CX, 16408(SP)(R10*4) - CMPL (DX)(BP*1), SI + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x34, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x36, R11 + MOVL 24(SP)(R10*4), SI + MOVL 16408(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + MOVL CX, 16408(SP)(R11*4) + CMPL (DX)(SI*1), DI JEQ candidate_match_encodeBetterBlockAsm10B - CMPL (DX)(DI*1), SI + CMPL (DX)(R8*1), DI JEQ candidateS_match_encodeBetterBlockAsm10B MOVL 20(SP), CX JMP search_loop_encodeBetterBlockAsm10B candidateS_match_encodeBetterBlockAsm10B: - SHRQ $0x08, SI - MOVQ SI, R9 - SHLQ $0x10, R9 - IMULQ R8, R9 - SHRQ $0x34, R9 - MOVL 24(SP)(R9*4), BP + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x34, R10 + MOVL 24(SP)(R10*4), SI INCL CX - MOVL CX, 24(SP)(R9*4) - CMPL (DX)(BP*1), SI + MOVL CX, 24(SP)(R10*4) + CMPL (DX)(SI*1), DI JEQ candidate_match_encodeBetterBlockAsm10B DECL CX - MOVL DI, BP + MOVL R8, SI candidate_match_encodeBetterBlockAsm10B: - MOVL 12(SP), SI - TESTL BP, BP + MOVL 12(SP), DI + TESTL SI, SI JZ match_extend_back_end_encodeBetterBlockAsm10B match_extend_back_loop_encodeBetterBlockAsm10B: - CMPL CX, SI + CMPL CX, DI JLE match_extend_back_end_encodeBetterBlockAsm10B - MOVB -1(DX)(BP*1), BL - MOVB -1(DX)(CX*1), DI - CMPB BL, DI + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 JNE match_extend_back_end_encodeBetterBlockAsm10B LEAL -1(CX), CX - DECL BP + DECL SI JZ match_extend_back_end_encodeBetterBlockAsm10B JMP match_extend_back_loop_encodeBetterBlockAsm10B match_extend_back_end_encodeBetterBlockAsm10B: - MOVL CX, SI - SUBL 12(SP), SI - LEAQ 3(AX)(SI*1), SI - CMPQ SI, (SP) + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 3(AX)(DI*1), DI + CMPQ DI, (SP) JL match_dst_size_check_encodeBetterBlockAsm10B MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeBetterBlockAsm10B: - MOVL CX, SI + MOVL CX, DI ADDL $0x04, CX - ADDL $0x04, BP - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(BP*1), R9 + ADDL $0x04, SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), R10 // matchLen - XORL R11, R11 - CMPL DI, $0x08 + XORL R12, R12 + CMPL R8, $0x08 JL matchlen_single_match_nolit_encodeBetterBlockAsm10B matchlen_loopback_match_nolit_encodeBetterBlockAsm10B: - MOVQ (R8)(R11*1), R10 - XORQ (R9)(R11*1), R10 - TESTQ R10, R10 + MOVQ (R9)(R12*1), R11 + XORQ (R10)(R12*1), R11 + TESTQ R11, R11 JZ matchlen_loop_match_nolit_encodeBetterBlockAsm10B - BSFQ R10, R10 - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 + BSFQ R11, R11 + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 JMP match_nolit_end_encodeBetterBlockAsm10B matchlen_loop_match_nolit_encodeBetterBlockAsm10B: - LEAL -8(DI), DI - LEAL 8(R11), R11 - CMPL DI, $0x08 + LEAL -8(R8), R8 + LEAL 8(R12), R12 + CMPL R8, $0x08 JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm10B matchlen_single_match_nolit_encodeBetterBlockAsm10B: - TESTL DI, DI + TESTL R8, R8 JZ match_nolit_end_encodeBetterBlockAsm10B matchlen_single_loopback_match_nolit_encodeBetterBlockAsm10B: - MOVB (R8)(R11*1), R10 - CMPB (R9)(R11*1), R10 + MOVB (R9)(R12*1), R11 + CMPB (R10)(R12*1), R11 JNE match_nolit_end_encodeBetterBlockAsm10B - LEAL 1(R11), R11 - DECL DI + LEAL 1(R12), R12 + DECL R8 JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm10B match_nolit_end_encodeBetterBlockAsm10B: - MOVL CX, DI - SUBL BP, DI + MOVL CX, R8 + SUBL SI, R8 // Check if repeat - CMPL 16(SP), DI + CMPL 16(SP), R8 JEQ match_is_repeat_encodeBetterBlockAsm10B - MOVL DI, 16(SP) - MOVL 12(SP), BP - CMPL BP, SI + MOVL R8, 16(SP) + MOVL 12(SP), SI + CMPL SI, DI JEQ emit_literal_done_match_emit_encodeBetterBlockAsm10B - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(BP*1), R9 - SUBL BP, R8 - LEAL -1(R8), BP - CMPL BP, $0x3c + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c JLT one_byte_match_emit_encodeBetterBlockAsm10B - CMPL BP, $0x00000100 + CMPL SI, $0x00000100 JLT two_bytes_match_emit_encodeBetterBlockAsm10B MOVB $0xf4, (AX) - MOVW BP, 1(AX) + MOVW SI, 1(AX) ADDQ $0x03, AX JMP memmove_long_match_emit_encodeBetterBlockAsm10B two_bytes_match_emit_encodeBetterBlockAsm10B: MOVB $0xf0, (AX) - MOVB BP, 1(AX) + MOVB SI, 1(AX) ADDQ $0x02, AX - CMPL BP, $0x40 + CMPL SI, $0x40 JL memmove_match_emit_encodeBetterBlockAsm10B JMP memmove_long_match_emit_encodeBetterBlockAsm10B one_byte_match_emit_encodeBetterBlockAsm10B: - SHLB $0x02, BP - MOVB BP, (AX) + SHLB $0x02, SI + MOVB SI, (AX) ADDQ $0x01, AX memmove_match_emit_encodeBetterBlockAsm10B: - LEAQ (AX)(R8*1), BP + LEAQ (AX)(R9*1), SI // genMemMoveShort - CMPQ R8, $0x03 - JB emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_1or2 - JE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_3 - CMPQ R8, $0x08 + CMPQ R9, $0x04 + JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4 + CMPQ R9, $0x08 JB emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7 - CMPQ R8, $0x10 + CMPQ R9, $0x10 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16 - CMPQ R8, $0x20 + CMPQ R9, $0x20 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64 -emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_1or2: - MOVB (R9), R10 - MOVB -1(R9)(R8*1), R9 - MOVB R10, (AX) - MOVB R9, -1(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_3: - MOVW (R9), R10 - MOVB 2(R9), R9 - MOVW R10, (AX) - MOVB R9, 2(AX) +emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4: + MOVL (R10), R11 + MOVL R11, (AX) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7: - MOVL (R9), R10 - MOVL -4(R9)(R8*1), R9 - MOVL R10, (AX) - MOVL R9, -4(AX)(R8*1) + MOVL (R10), R11 + MOVL -4(R10)(R9*1), R10 + MOVL R11, (AX) + MOVL R10, -4(AX)(R9*1) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) + MOVOU X1, -16(AX)(R9*1) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) memmove_end_copy_match_emit_encodeBetterBlockAsm10B: - MOVQ BP, AX + MOVQ SI, AX JMP emit_literal_done_match_emit_encodeBetterBlockAsm10B memmove_long_match_emit_encodeBetterBlockAsm10B: - LEAQ (AX)(R8*1), BP + LEAQ (AX)(R9*1), SI // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R12 - SHRQ $0x05, R12 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R13 - SUBQ R10, R13 - DECQ R12 + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(R9)(R13*1), R10 - LEAQ -32(AX)(R13*1), R14 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(AX)(R14*1), R15 emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 ADDQ $0x20, R14 - ADDQ $0x20, R10 - ADDQ $0x20, R13 - DECQ R12 + DECQ R13 JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(R9)(R13*1), X4 - MOVOU -16(R9)(R13*1), X5 - MOVOA X4, -32(AX)(R13*1) - MOVOA X5, -16(AX)(R13*1) - ADDQ $0x20, R13 - CMPQ R8, R13 + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ BP, AX + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX emit_literal_done_match_emit_encodeBetterBlockAsm10B: - ADDL R11, CX - ADDL $0x04, R11 + ADDL R12, CX + ADDL $0x04, R12 MOVL CX, 12(SP) // emitCopy two_byte_offset_match_nolit_encodeBetterBlockAsm10B: - CMPL R11, $0x40 + CMPL R12, $0x40 JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B MOVB $0xee, (AX) - MOVW DI, 1(AX) - LEAL -60(R11), R11 + MOVW R8, 1(AX) + LEAL -60(R12), R12 ADDQ $0x03, AX // emitRepeat - MOVL R11, BP - LEAL -4(R11), R11 - CMPL BP, $0x08 + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 JLE repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short - CMPL BP, $0x0c + CMPL SI, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short - CMPL DI, $0x00000800 + CMPL R8, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: - CMPL R11, $0x00000104 + CMPL R12, $0x00000104 JLT repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short - LEAL -256(R11), R11 + LEAL -256(R12), R12 MOVW $0x0019, (AX) - MOVW R11, 2(AX) + MOVW R12, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: - LEAL -4(R11), R11 + LEAL -4(R12), R12 MOVW $0x0015, (AX) - MOVB R11, 2(AX) + MOVB R12, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: - SHLL $0x02, R11 - ORL $0x01, R11 - MOVW R11, (AX) + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: - XORQ BP, BP - LEAL 1(BP)(R11*4), R11 - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, R11 - MOVB R11, (AX) + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B JMP two_byte_offset_match_nolit_encodeBetterBlockAsm10B two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B: - CMPL R11, $0x0c + CMPL R12, $0x0c JGE emit_copy_three_match_nolit_encodeBetterBlockAsm10B - CMPL DI, $0x00000800 + CMPL R8, $0x00000800 JGE emit_copy_three_match_nolit_encodeBetterBlockAsm10B MOVB $0x01, BL - LEAL -16(BX)(R11*4), R11 - MOVB DI, 1(AX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, R11 - MOVB R11, (AX) + LEAL -16(BX)(R12*4), R12 + MOVB R8, 1(AX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B emit_copy_three_match_nolit_encodeBetterBlockAsm10B: MOVB $0x02, BL - LEAL -4(BX)(R11*4), R11 - MOVB R11, (AX) - MOVW DI, 1(AX) + LEAL -4(BX)(R12*4), R12 + MOVB R12, (AX) + MOVW R8, 1(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B match_is_repeat_encodeBetterBlockAsm10B: - MOVL 12(SP), BP - CMPL BP, SI + MOVL 12(SP), SI + CMPL SI, DI JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(BP*1), R9 - SUBL BP, R8 - LEAL -1(R8), BP - CMPL BP, $0x3c + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c JLT one_byte_match_emit_repeat_encodeBetterBlockAsm10B - CMPL BP, $0x00000100 + CMPL SI, $0x00000100 JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm10B MOVB $0xf4, (AX) - MOVW BP, 1(AX) + MOVW SI, 1(AX) ADDQ $0x03, AX JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B two_bytes_match_emit_repeat_encodeBetterBlockAsm10B: MOVB $0xf0, (AX) - MOVB BP, 1(AX) + MOVB SI, 1(AX) ADDQ $0x02, AX - CMPL BP, $0x40 + CMPL SI, $0x40 JL memmove_match_emit_repeat_encodeBetterBlockAsm10B JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B one_byte_match_emit_repeat_encodeBetterBlockAsm10B: - SHLB $0x02, BP - MOVB BP, (AX) + SHLB $0x02, SI + MOVB SI, (AX) ADDQ $0x01, AX memmove_match_emit_repeat_encodeBetterBlockAsm10B: - LEAQ (AX)(R8*1), BP + LEAQ (AX)(R9*1), SI // genMemMoveShort - CMPQ R8, $0x03 - JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_1or2 - JE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_3 - CMPQ R8, $0x08 + CMPQ R9, $0x04 + JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4 + CMPQ R9, $0x08 JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7 - CMPQ R8, $0x10 + CMPQ R9, $0x10 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16 - CMPQ R8, $0x20 + CMPQ R9, $0x20 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32 JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64 -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_1or2: - MOVB (R9), R10 - MOVB -1(R9)(R8*1), R9 - MOVB R10, (AX) - MOVB R9, -1(AX)(R8*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_3: - MOVW (R9), R10 - MOVB 2(R9), R9 - MOVW R10, (AX) - MOVB R9, 2(AX) +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4: + MOVL (R10), R11 + MOVL R11, (AX) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7: - MOVL (R9), R10 - MOVL -4(R9)(R8*1), R9 - MOVL R10, (AX) - MOVL R9, -4(AX)(R8*1) + MOVL (R10), R11 + MOVL -4(R10)(R9*1), R10 + MOVL R11, (AX) + MOVL R10, -4(AX)(R9*1) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) + MOVOU X1, -16(AX)(R9*1) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B: - MOVQ BP, AX + MOVQ SI, AX JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B memmove_long_match_emit_repeat_encodeBetterBlockAsm10B: - LEAQ (AX)(R8*1), BP + LEAQ (AX)(R9*1), SI // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R12 - SHRQ $0x05, R12 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R13 - SUBQ R10, R13 - DECQ R12 + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(R9)(R13*1), R10 - LEAQ -32(AX)(R13*1), R14 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(AX)(R14*1), R15 emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 ADDQ $0x20, R14 - ADDQ $0x20, R10 - ADDQ $0x20, R13 - DECQ R12 + DECQ R13 JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(R9)(R13*1), X4 - MOVOU -16(R9)(R13*1), X5 - MOVOA X4, -32(AX)(R13*1) - MOVOA X5, -16(AX)(R13*1) - ADDQ $0x20, R13 - CMPQ R8, R13 + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ BP, AX + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B: - ADDL R11, CX - ADDL $0x04, R11 + ADDL R12, CX + ADDL $0x04, R12 MOVL CX, 12(SP) // emitRepeat - MOVL R11, BP - LEAL -4(R11), R11 - CMPL BP, $0x08 + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B - CMPL BP, $0x0c + CMPL SI, $0x0c JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B - CMPL DI, $0x00000800 + CMPL R8, $0x00000800 JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B: - CMPL R11, $0x00000104 + CMPL R12, $0x00000104 JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B - LEAL -256(R11), R11 + LEAL -256(R12), R12 MOVW $0x0019, (AX) - MOVW R11, 2(AX) + MOVW R12, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B: - LEAL -4(R11), R11 + LEAL -4(R12), R12 MOVW $0x0015, (AX) - MOVB R11, 2(AX) + MOVB R12, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B: - SHLL $0x02, R11 - ORL $0x01, R11 - MOVW R11, (AX) + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B: - XORQ BP, BP - LEAL 1(BP)(R11*4), R11 - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, R11 - MOVB R11, (AX) + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) ADDQ $0x02, AX match_nolit_emitcopy_end_encodeBetterBlockAsm10B: @@ -8239,36 +7911,53 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm10B: RET match_nolit_dst_ok_encodeBetterBlockAsm10B: - MOVQ $0x0000cf1bbcdcbf9b, BP - MOVQ $0x9e3779b1, DI - INCL SI - MOVQ (DX)(SI*1), R8 - MOVQ R8, R9 - MOVQ R8, R10 - SHRQ $0x08, R10 - LEAL 1(SI), R11 - MOVQ -2(DX)(CX*1), R8 - SHLQ $0x10, R9 - IMULQ BP, R9 - SHRQ $0x34, R9 - SHLQ $0x20, R10 - IMULQ DI, R10 - SHRQ $0x36, R10 - MOVL SI, 24(SP)(R9*4) - MOVL R11, 16408(SP)(R10*4) - MOVQ R8, R9 - MOVQ R8, R10 - SHRQ $0x08, R10 - LEAL -2(CX), R8 - LEAL -1(CX), SI - SHLQ $0x10, R9 - IMULQ BP, R9 - SHRQ $0x34, R9 - SHLQ $0x20, R10 - IMULQ DI, R10 - SHRQ $0x36, R10 - MOVL R8, 24(SP)(R9*4) - MOVL SI, 16408(SP)(R10*4) + MOVQ $0x0000cf1bbcdcbf9b, SI + MOVQ $0x9e3779b1, R8 + INCL DI + MOVQ (DX)(DI*1), R9 + MOVQ R9, R10 + MOVQ R9, R11 + MOVQ R9, R12 + SHRQ $0x08, R11 + MOVQ R11, R13 + SHRQ $0x10, R12 + LEAL 1(DI), R14 + LEAL 2(DI), R15 + MOVQ -2(DX)(CX*1), R9 + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x34, R10 + SHLQ $0x10, R13 + IMULQ SI, R13 + SHRQ $0x34, R13 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x36, R11 + SHLQ $0x20, R12 + IMULQ R8, R12 + SHRQ $0x36, R12 + MOVL DI, 24(SP)(R10*4) + MOVL R14, 24(SP)(R13*4) + MOVL R14, 16408(SP)(R11*4) + MOVL R15, 16408(SP)(R12*4) + MOVQ R9, R10 + MOVQ R9, R11 + SHRQ $0x08, R11 + MOVQ R11, R13 + LEAL -2(CX), R9 + LEAL -1(CX), DI + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x34, R10 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x36, R11 + SHLQ $0x10, R13 + IMULQ SI, R13 + SHRQ $0x34, R13 + MOVL R9, 24(SP)(R10*4) + MOVL DI, 16408(SP)(R11*4) + MOVL DI, 24(SP)(R13*4) JMP search_loop_encodeBetterBlockAsm10B emit_remainder_encodeBetterBlockAsm10B: @@ -8285,11 +7974,11 @@ emit_remainder_ok_encodeBetterBlockAsm10B: MOVL 12(SP), BX CMPL BX, CX JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm10B - MOVL CX, BP + MOVL CX, SI MOVL CX, 12(SP) LEAQ (DX)(BX*1), CX - SUBL BX, BP - LEAL -1(BP), DX + SUBL BX, SI + LEAL -1(SI), DX CMPL DX, $0x3c JLT one_byte_emit_remainder_encodeBetterBlockAsm10B CMPL DX, $0x00000100 @@ -8313,13 +8002,12 @@ one_byte_emit_remainder_encodeBetterBlockAsm10B: ADDQ $0x01, AX memmove_emit_remainder_encodeBetterBlockAsm10B: - LEAQ (AX)(BP*1), DX - MOVL BP, BX + LEAQ (AX)(SI*1), DX + MOVL SI, BX // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3 + CMPQ BX, $0x04 + JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7 CMPQ BX, $0x10 @@ -8328,31 +8016,22 @@ memmove_emit_remainder_encodeBetterBlockAsm10B: JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2: - MOVB (CX), BP - MOVB -1(CX)(BX*1), CL - MOVB BP, (AX) - MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3: - MOVW (CX), BP - MOVB 2(CX), CL - MOVW BP, (AX) - MOVB CL, 2(AX) +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4: + MOVL (CX), SI + MOVL SI, (AX) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7: - MOVL (CX), BP + MOVL (CX), SI MOVL -4(CX)(BX*1), CX - MOVL BP, (AX) + MOVL SI, (AX) MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16: - MOVQ (CX), BP + MOVQ (CX), SI MOVQ -8(CX)(BX*1), CX - MOVQ BP, (AX) + MOVQ SI, (AX) MOVQ CX, -8(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B @@ -8378,43 +8057,43 @@ memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B: JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm10B memmove_long_emit_remainder_encodeBetterBlockAsm10B: - LEAQ (AX)(BP*1), DX - MOVL BP, BX + LEAQ (AX)(SI*1), DX + MOVL SI, BX // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 - MOVQ BX, SI - SHRQ $0x05, SI - MOVQ AX, BP - ANDL $0x0000001f, BP - MOVQ $0x00000040, DI - SUBQ BP, DI - DECQ SI + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(CX)(DI*1), BP - LEAQ -32(AX)(DI*1), R8 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back: - MOVOU (BP), X4 - MOVOU 16(BP), X5 - MOVOA X4, (R8) - MOVOA X5, 16(R8) + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI ADDQ $0x20, R8 - ADDQ $0x20, BP - ADDQ $0x20, DI - DECQ SI + DECQ DI JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(CX)(DI*1), X4 - MOVOU -16(CX)(DI*1), X5 - MOVOA X4, -32(AX)(DI*1) - MOVOA X5, -16(AX)(DI*1) - ADDQ $0x20, DI - CMPQ BX, DI + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) @@ -8451,8 +8130,8 @@ zero_loop_encodeBetterBlockAsm8B: MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -6(CX), DX - LEAQ -8(CX), BP - MOVL BP, 8(SP) + LEAQ -8(CX), SI + MOVL SI, 8(SP) SHRQ $0x05, CX SUBL CX, DX LEAQ (AX)(DX*1), DX @@ -8462,206 +8141,401 @@ zero_loop_encodeBetterBlockAsm8B: MOVQ src_base+24(FP), DX search_loop_encodeBetterBlockAsm8B: - MOVQ (DX)(CX*1), SI - MOVL CX, BP - SUBL 12(SP), BP - SHRL $0x04, BP - LEAL 1(CX)(BP*1), BP - CMPL BP, 8(SP) + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x04, SI + LEAL 1(CX)(SI*1), SI + CMPL SI, 8(SP) JGE emit_remainder_encodeBetterBlockAsm8B - MOVL BP, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R8 - MOVQ $0x9e3779b1, BP - MOVQ SI, R9 - MOVQ SI, R10 - SHLQ $0x10, R9 - IMULQ R8, R9 - SHRQ $0x36, R9 - SHLQ $0x20, R10 - IMULQ BP, R10 - SHRQ $0x38, R10 - MOVL 24(SP)(R9*4), BP - MOVL 4120(SP)(R10*4), DI - MOVL CX, 24(SP)(R9*4) - MOVL CX, 4120(SP)(R10*4) - CMPL (DX)(BP*1), SI + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x36, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x38, R11 + MOVL 24(SP)(R10*4), SI + MOVL 4120(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + MOVL CX, 4120(SP)(R11*4) + CMPL (DX)(SI*1), DI JEQ candidate_match_encodeBetterBlockAsm8B - CMPL (DX)(DI*1), SI + CMPL (DX)(R8*1), DI JEQ candidateS_match_encodeBetterBlockAsm8B MOVL 20(SP), CX JMP search_loop_encodeBetterBlockAsm8B candidateS_match_encodeBetterBlockAsm8B: - SHRQ $0x08, SI - MOVQ SI, R9 - SHLQ $0x10, R9 - IMULQ R8, R9 - SHRQ $0x36, R9 - MOVL 24(SP)(R9*4), BP + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x36, R10 + MOVL 24(SP)(R10*4), SI INCL CX - MOVL CX, 24(SP)(R9*4) - CMPL (DX)(BP*1), SI + MOVL CX, 24(SP)(R10*4) + CMPL (DX)(SI*1), DI JEQ candidate_match_encodeBetterBlockAsm8B DECL CX - MOVL DI, BP + MOVL R8, SI candidate_match_encodeBetterBlockAsm8B: - MOVL 12(SP), SI - TESTL BP, BP + MOVL 12(SP), DI + TESTL SI, SI JZ match_extend_back_end_encodeBetterBlockAsm8B match_extend_back_loop_encodeBetterBlockAsm8B: - CMPL CX, SI + CMPL CX, DI JLE match_extend_back_end_encodeBetterBlockAsm8B - MOVB -1(DX)(BP*1), BL - MOVB -1(DX)(CX*1), DI - CMPB BL, DI + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 JNE match_extend_back_end_encodeBetterBlockAsm8B LEAL -1(CX), CX - DECL BP + DECL SI JZ match_extend_back_end_encodeBetterBlockAsm8B JMP match_extend_back_loop_encodeBetterBlockAsm8B match_extend_back_end_encodeBetterBlockAsm8B: - MOVL CX, SI - SUBL 12(SP), SI - LEAQ 3(AX)(SI*1), SI - CMPQ SI, (SP) + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 3(AX)(DI*1), DI + CMPQ DI, (SP) JL match_dst_size_check_encodeBetterBlockAsm8B MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeBetterBlockAsm8B: - MOVL CX, SI + MOVL CX, DI ADDL $0x04, CX - ADDL $0x04, BP - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(BP*1), R9 + ADDL $0x04, SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), R10 // matchLen - XORL R11, R11 - CMPL DI, $0x08 + XORL R12, R12 + CMPL R8, $0x08 JL matchlen_single_match_nolit_encodeBetterBlockAsm8B matchlen_loopback_match_nolit_encodeBetterBlockAsm8B: - MOVQ (R8)(R11*1), R10 - XORQ (R9)(R11*1), R10 - TESTQ R10, R10 + MOVQ (R9)(R12*1), R11 + XORQ (R10)(R12*1), R11 + TESTQ R11, R11 JZ matchlen_loop_match_nolit_encodeBetterBlockAsm8B - BSFQ R10, R10 - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 + BSFQ R11, R11 + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 JMP match_nolit_end_encodeBetterBlockAsm8B matchlen_loop_match_nolit_encodeBetterBlockAsm8B: - LEAL -8(DI), DI - LEAL 8(R11), R11 - CMPL DI, $0x08 + LEAL -8(R8), R8 + LEAL 8(R12), R12 + CMPL R8, $0x08 JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm8B matchlen_single_match_nolit_encodeBetterBlockAsm8B: - TESTL DI, DI + TESTL R8, R8 JZ match_nolit_end_encodeBetterBlockAsm8B matchlen_single_loopback_match_nolit_encodeBetterBlockAsm8B: - MOVB (R8)(R11*1), R10 - CMPB (R9)(R11*1), R10 + MOVB (R9)(R12*1), R11 + CMPB (R10)(R12*1), R11 JNE match_nolit_end_encodeBetterBlockAsm8B - LEAL 1(R11), R11 - DECL DI + LEAL 1(R12), R12 + DECL R8 JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm8B match_nolit_end_encodeBetterBlockAsm8B: - MOVL CX, DI - SUBL BP, DI + MOVL CX, R8 + SUBL SI, R8 // Check if repeat - CMPL 16(SP), DI + CMPL 16(SP), R8 JEQ match_is_repeat_encodeBetterBlockAsm8B - MOVL DI, 16(SP) - MOVL 12(SP), BP - CMPL BP, SI + MOVL R8, 16(SP) + MOVL 12(SP), SI + CMPL SI, DI JEQ emit_literal_done_match_emit_encodeBetterBlockAsm8B - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(BP*1), R9 - SUBL BP, R8 - LEAL -1(R8), BP - CMPL BP, $0x3c + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c JLT one_byte_match_emit_encodeBetterBlockAsm8B - CMPL BP, $0x00000100 + CMPL SI, $0x00000100 JLT two_bytes_match_emit_encodeBetterBlockAsm8B MOVB $0xf4, (AX) - MOVW BP, 1(AX) + MOVW SI, 1(AX) ADDQ $0x03, AX JMP memmove_long_match_emit_encodeBetterBlockAsm8B two_bytes_match_emit_encodeBetterBlockAsm8B: MOVB $0xf0, (AX) - MOVB BP, 1(AX) + MOVB SI, 1(AX) ADDQ $0x02, AX - CMPL BP, $0x40 + CMPL SI, $0x40 JL memmove_match_emit_encodeBetterBlockAsm8B JMP memmove_long_match_emit_encodeBetterBlockAsm8B one_byte_match_emit_encodeBetterBlockAsm8B: - SHLB $0x02, BP - MOVB BP, (AX) + SHLB $0x02, SI + MOVB SI, (AX) ADDQ $0x01, AX memmove_match_emit_encodeBetterBlockAsm8B: - LEAQ (AX)(R8*1), BP + LEAQ (AX)(R9*1), SI // genMemMoveShort - CMPQ R8, $0x03 - JB emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_1or2 - JE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_3 - CMPQ R8, $0x08 + CMPQ R9, $0x04 + JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4 + CMPQ R9, $0x08 JB emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7 - CMPQ R8, $0x10 + CMPQ R9, $0x10 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16 - CMPQ R8, $0x20 + CMPQ R9, $0x20 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64 -emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_1or2: - MOVB (R9), R10 - MOVB -1(R9)(R8*1), R9 - MOVB R10, (AX) - MOVB R9, -1(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_3: - MOVW (R9), R10 - MOVB 2(R9), R9 - MOVW R10, (AX) - MOVB R9, 2(AX) +emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4: + MOVL (R10), R11 + MOVL R11, (AX) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7: + MOVL (R10), R11 + MOVL -4(R10)(R9*1), R10 + MOVL R11, (AX) + MOVL R10, -4(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_encodeBetterBlockAsm8B: + MOVQ SI, AX + JMP emit_literal_done_match_emit_encodeBetterBlockAsm8B + +memmove_long_match_emit_encodeBetterBlockAsm8B: + LEAQ (AX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(AX)(R14*1), R15 + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX + +emit_literal_done_match_emit_encodeBetterBlockAsm8B: + ADDL R12, CX + ADDL $0x04, R12 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeBetterBlockAsm8B: + CMPL R12, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B + MOVB $0xee, (AX) + MOVW R8, 1(AX) + LEAL -60(R12), R12 + ADDQ $0x03, AX + + // emitRepeat + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 + JLE repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: + CMPL R12, $0x00000104 + JLT repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short + LEAL -256(R12), R12 + MOVW $0x0019, (AX) + MOVW R12, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: + LEAL -4(R12), R12 + MOVW $0x0015, (AX) + MOVB R12, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + JMP two_byte_offset_match_nolit_encodeBetterBlockAsm8B + +two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B: + CMPL R12, $0x0c + JGE emit_copy_three_match_nolit_encodeBetterBlockAsm8B + MOVB $0x01, BL + LEAL -16(BX)(R12*4), R12 + MOVB R8, 1(AX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +emit_copy_three_match_nolit_encodeBetterBlockAsm8B: + MOVB $0x02, BL + LEAL -4(BX)(R12*4), R12 + MOVB R12, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +match_is_repeat_encodeBetterBlockAsm8B: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B + MOVL DI, R8 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R9 + SUBL SI, R8 + LEAL -1(R8), SI + CMPL SI, $0x3c + JLT one_byte_match_emit_repeat_encodeBetterBlockAsm8B + CMPL SI, $0x00000100 + JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm8B + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B + +two_bytes_match_emit_repeat_encodeBetterBlockAsm8B: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_match_emit_repeat_encodeBetterBlockAsm8B + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B + +one_byte_match_emit_repeat_encodeBetterBlockAsm8B: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_repeat_encodeBetterBlockAsm8B: + LEAQ (AX)(R8*1), SI + + // genMemMoveShort + CMPQ R8, $0x04 + JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4 + CMPQ R8, $0x08 + JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4: + MOVL (R9), R10 + MOVL R10, (AX) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7: MOVL (R9), R10 MOVL -4(R9)(R8*1), R9 MOVL R10, (AX) MOVL R9, -4(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B -emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16: +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16: MOVQ (R9), R10 MOVQ -8(R9)(R8*1), R9 MOVQ R10, (AX) MOVQ R9, -8(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B -emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32: +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32: MOVOU (R9), X0 MOVOU -16(R9)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B -emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64: +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64: MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 @@ -8671,30 +8545,30 @@ emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64: MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) -memmove_end_copy_match_emit_encodeBetterBlockAsm8B: - MOVQ BP, AX - JMP emit_literal_done_match_emit_encodeBetterBlockAsm8B +memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B: + MOVQ SI, AX + JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B -memmove_long_match_emit_encodeBetterBlockAsm8B: - LEAQ (AX)(R8*1), BP +memmove_long_match_emit_repeat_encodeBetterBlockAsm8B: + LEAQ (AX)(R8*1), SI // genMemMoveLong MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R12 - SHRQ $0x05, R12 + MOVQ R8, R11 + SHRQ $0x05, R11 MOVQ AX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R13 SUBQ R10, R13 - DECQ R12 - JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + DECQ R11 + JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 LEAQ -32(R9)(R13*1), R10 LEAQ -32(AX)(R13*1), R14 -emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back: +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 MOVOA X4, (R14) @@ -8702,280 +8576,65 @@ emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back: ADDQ $0x20, R14 ADDQ $0x20, R10 ADDQ $0x20, R13 - DECQ R12 - JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back + DECQ R11 + JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back -emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: MOVOU -32(R9)(R13*1), X4 MOVOU -16(R9)(R13*1), X5 MOVOA X4, -32(AX)(R13*1) MOVOA X5, -16(AX)(R13*1) ADDQ $0x20, R13 CMPQ R8, R13 - JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) - MOVQ BP, AX - -emit_literal_done_match_emit_encodeBetterBlockAsm8B: - ADDL R11, CX - ADDL $0x04, R11 - MOVL CX, 12(SP) - - // emitCopy -two_byte_offset_match_nolit_encodeBetterBlockAsm8B: - CMPL R11, $0x40 - JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B - MOVB $0xee, (AX) - MOVW DI, 1(AX) - LEAL -60(R11), R11 - ADDQ $0x03, AX - - // emitRepeat - MOVL R11, BP - LEAL -4(R11), R11 - CMPL BP, $0x08 - JLE repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short - CMPL BP, $0x0c - JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short - -cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: - CMPL R11, $0x00000104 - JLT repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short - LEAL -256(R11), R11 - MOVW $0x0019, (AX) - MOVW R11, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - -repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: - LEAL -4(R11), R11 - MOVW $0x0015, (AX) - MOVB R11, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - -repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: - SHLL $0x02, R11 - ORL $0x01, R11 - MOVW R11, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - XORQ BP, BP - LEAL 1(BP)(R11*4), R11 - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, R11 - MOVB R11, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - JMP two_byte_offset_match_nolit_encodeBetterBlockAsm8B - -two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B: - CMPL R11, $0x0c - JGE emit_copy_three_match_nolit_encodeBetterBlockAsm8B - MOVB $0x01, BL - LEAL -16(BX)(R11*4), R11 - MOVB DI, 1(AX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, R11 - MOVB R11, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - -emit_copy_three_match_nolit_encodeBetterBlockAsm8B: - MOVB $0x02, BL - LEAL -4(BX)(R11*4), R11 - MOVB R11, (AX) - MOVW DI, 1(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - -match_is_repeat_encodeBetterBlockAsm8B: - MOVL 12(SP), BP - CMPL BP, SI - JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B - MOVL SI, DI - MOVL SI, 12(SP) - LEAQ (DX)(BP*1), R8 - SUBL BP, DI - LEAL -1(DI), BP - CMPL BP, $0x3c - JLT one_byte_match_emit_repeat_encodeBetterBlockAsm8B - CMPL BP, $0x00000100 - JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm8B - MOVB $0xf4, (AX) - MOVW BP, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B - -two_bytes_match_emit_repeat_encodeBetterBlockAsm8B: - MOVB $0xf0, (AX) - MOVB BP, 1(AX) - ADDQ $0x02, AX - CMPL BP, $0x40 - JL memmove_match_emit_repeat_encodeBetterBlockAsm8B - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B - -one_byte_match_emit_repeat_encodeBetterBlockAsm8B: - SHLB $0x02, BP - MOVB BP, (AX) - ADDQ $0x01, AX - -memmove_match_emit_repeat_encodeBetterBlockAsm8B: - LEAQ (AX)(DI*1), BP - - // genMemMoveShort - CMPQ DI, $0x03 - JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_1or2 - JE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_3 - CMPQ DI, $0x08 - JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7 - CMPQ DI, $0x10 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16 - CMPQ DI, $0x20 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_1or2: - MOVB (R8), R9 - MOVB -1(R8)(DI*1), R8 - MOVB R9, (AX) - MOVB R8, -1(AX)(DI*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_3: - MOVW (R8), R9 - MOVB 2(R8), R8 - MOVW R9, (AX) - MOVB R8, 2(AX) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7: - MOVL (R8), R9 - MOVL -4(R8)(DI*1), R8 - MOVL R9, (AX) - MOVL R8, -4(AX)(DI*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16: - MOVQ (R8), R9 - MOVQ -8(R8)(DI*1), R8 - MOVQ R9, (AX) - MOVQ R8, -8(AX)(DI*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32: - MOVOU (R8), X0 - MOVOU -16(R8)(DI*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(DI*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64: - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(DI*1), X2 - MOVOU -16(R8)(DI*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(DI*1) - MOVOU X3, -16(AX)(DI*1) - -memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B: - MOVQ BP, AX - JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B - -memmove_long_match_emit_repeat_encodeBetterBlockAsm8B: - LEAQ (AX)(DI*1), BP - - // genMemMoveLong - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(DI*1), X2 - MOVOU -16(R8)(DI*1), X3 - MOVQ DI, R10 - SHRQ $0x05, R10 - MOVQ AX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R12 - SUBQ R9, R12 - DECQ R10 - JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(R8)(R12*1), R9 - LEAQ -32(AX)(R12*1), R13 - -emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back: - MOVOU (R9), X4 - MOVOU 16(R9), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R9 - ADDQ $0x20, R12 - DECQ R10 - JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(R8)(R12*1), X4 - MOVOU -16(R8)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ DI, R12 - JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(DI*1) - MOVOU X3, -16(AX)(DI*1) - MOVQ BP, AX + MOVQ SI, AX emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B: - ADDL R11, CX - ADDL $0x04, R11 + ADDL R12, CX + ADDL $0x04, R12 MOVL CX, 12(SP) // emitRepeat - MOVL R11, BP - LEAL -4(R11), R11 - CMPL BP, $0x08 + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B - CMPL BP, $0x0c + CMPL SI, $0x0c JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B: - CMPL R11, $0x00000104 + CMPL R12, $0x00000104 JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B - LEAL -256(R11), R11 + LEAL -256(R12), R12 MOVW $0x0019, (AX) - MOVW R11, 2(AX) + MOVW R12, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B: - LEAL -4(R11), R11 + LEAL -4(R12), R12 MOVW $0x0015, (AX) - MOVB R11, 2(AX) + MOVB R12, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B: - SHLL $0x02, R11 - ORL $0x01, R11 - MOVW R11, (AX) + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - XORQ BP, BP - LEAL 1(BP)(R11*4), R11 - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, R11 - MOVB R11, (AX) + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) ADDQ $0x02, AX match_nolit_emitcopy_end_encodeBetterBlockAsm8B: @@ -8987,36 +8646,53 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm8B: RET match_nolit_dst_ok_encodeBetterBlockAsm8B: - MOVQ $0x0000cf1bbcdcbf9b, BP - MOVQ $0x9e3779b1, DI - INCL SI - MOVQ (DX)(SI*1), R8 - MOVQ R8, R9 - MOVQ R8, R10 - SHRQ $0x08, R10 - LEAL 1(SI), R11 - MOVQ -2(DX)(CX*1), R8 - SHLQ $0x10, R9 - IMULQ BP, R9 - SHRQ $0x36, R9 - SHLQ $0x20, R10 - IMULQ DI, R10 - SHRQ $0x38, R10 - MOVL SI, 24(SP)(R9*4) - MOVL R11, 4120(SP)(R10*4) - MOVQ R8, R9 - MOVQ R8, R10 - SHRQ $0x08, R10 - LEAL -2(CX), R8 - LEAL -1(CX), SI - SHLQ $0x10, R9 - IMULQ BP, R9 - SHRQ $0x36, R9 - SHLQ $0x20, R10 - IMULQ DI, R10 - SHRQ $0x38, R10 - MOVL R8, 24(SP)(R9*4) - MOVL SI, 4120(SP)(R10*4) + MOVQ $0x0000cf1bbcdcbf9b, SI + MOVQ $0x9e3779b1, R8 + INCL DI + MOVQ (DX)(DI*1), R9 + MOVQ R9, R10 + MOVQ R9, R11 + MOVQ R9, R12 + SHRQ $0x08, R11 + MOVQ R11, R13 + SHRQ $0x10, R12 + LEAL 1(DI), R14 + LEAL 2(DI), R15 + MOVQ -2(DX)(CX*1), R9 + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x36, R10 + SHLQ $0x10, R13 + IMULQ SI, R13 + SHRQ $0x36, R13 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x38, R11 + SHLQ $0x20, R12 + IMULQ R8, R12 + SHRQ $0x38, R12 + MOVL DI, 24(SP)(R10*4) + MOVL R14, 24(SP)(R13*4) + MOVL R14, 4120(SP)(R11*4) + MOVL R15, 4120(SP)(R12*4) + MOVQ R9, R10 + MOVQ R9, R11 + SHRQ $0x08, R11 + MOVQ R11, R13 + LEAL -2(CX), R9 + LEAL -1(CX), DI + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x36, R10 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x38, R11 + SHLQ $0x10, R13 + IMULQ SI, R13 + SHRQ $0x36, R13 + MOVL R9, 24(SP)(R10*4) + MOVL DI, 4120(SP)(R11*4) + MOVL DI, 24(SP)(R13*4) JMP search_loop_encodeBetterBlockAsm8B emit_remainder_encodeBetterBlockAsm8B: @@ -9033,11 +8709,11 @@ emit_remainder_ok_encodeBetterBlockAsm8B: MOVL 12(SP), BX CMPL BX, CX JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm8B - MOVL CX, BP + MOVL CX, SI MOVL CX, 12(SP) LEAQ (DX)(BX*1), CX - SUBL BX, BP - LEAL -1(BP), DX + SUBL BX, SI + LEAL -1(SI), DX CMPL DX, $0x3c JLT one_byte_emit_remainder_encodeBetterBlockAsm8B CMPL DX, $0x00000100 @@ -9061,13 +8737,12 @@ one_byte_emit_remainder_encodeBetterBlockAsm8B: ADDQ $0x01, AX memmove_emit_remainder_encodeBetterBlockAsm8B: - LEAQ (AX)(BP*1), DX - MOVL BP, BX + LEAQ (AX)(SI*1), DX + MOVL SI, BX // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3 + CMPQ BX, $0x04 + JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7 CMPQ BX, $0x10 @@ -9076,31 +8751,22 @@ memmove_emit_remainder_encodeBetterBlockAsm8B: JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2: - MOVB (CX), BP - MOVB -1(CX)(BX*1), CL - MOVB BP, (AX) - MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3: - MOVW (CX), BP - MOVB 2(CX), CL - MOVW BP, (AX) - MOVB CL, 2(AX) +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4: + MOVL (CX), SI + MOVL SI, (AX) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7: - MOVL (CX), BP + MOVL (CX), SI MOVL -4(CX)(BX*1), CX - MOVL BP, (AX) + MOVL SI, (AX) MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16: - MOVQ (CX), BP + MOVQ (CX), SI MOVQ -8(CX)(BX*1), CX - MOVQ BP, (AX) + MOVQ SI, (AX) MOVQ CX, -8(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B @@ -9126,43 +8792,43 @@ memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B: JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm8B memmove_long_emit_remainder_encodeBetterBlockAsm8B: - LEAQ (AX)(BP*1), DX - MOVL BP, BX + LEAQ (AX)(SI*1), DX + MOVL SI, BX // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 - MOVQ BX, SI - SHRQ $0x05, SI - MOVQ AX, BP - ANDL $0x0000001f, BP - MOVQ $0x00000040, DI - SUBQ BP, DI - DECQ SI + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(CX)(DI*1), BP - LEAQ -32(AX)(DI*1), R8 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back: - MOVOU (BP), X4 - MOVOU 16(BP), X5 - MOVOA X4, (R8) - MOVOA X5, 16(R8) + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI ADDQ $0x20, R8 - ADDQ $0x20, BP - ADDQ $0x20, DI - DECQ SI + DECQ DI JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(CX)(DI*1), X4 - MOVOU -16(CX)(DI*1), X5 - MOVOA X4, -32(AX)(DI*1) - MOVOA X5, -16(AX)(DI*1) - ADDQ $0x20, DI - CMPQ BX, DI + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) @@ -9198,9 +8864,9 @@ zero_loop_encodeSnappyBlockAsm: JNZ zero_loop_encodeSnappyBlockAsm MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX - LEAQ -5(CX), DX - LEAQ -8(CX), BP - MOVL BP, 8(SP) + LEAQ -9(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) SHRQ $0x05, CX SUBL CX, DX LEAQ (AX)(DX*1), DX @@ -9210,318 +8876,299 @@ zero_loop_encodeSnappyBlockAsm: MOVQ src_base+24(FP), DX search_loop_encodeSnappyBlockAsm: - MOVQ (DX)(CX*1), SI - MOVL CX, BP - SUBL 12(SP), BP - SHRL $0x06, BP - LEAL 4(CX)(BP*1), BP - CMPL BP, 8(SP) + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x06, SI + LEAL 4(CX)(SI*1), SI + CMPL SI, 8(SP) JGE emit_remainder_encodeSnappyBlockAsm - MOVL BP, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R8 - MOVQ SI, R9 - MOVQ SI, R10 - SHRQ $0x08, R10 - SHLQ $0x10, R9 - IMULQ R8, R9 - SHRQ $0x32, R9 + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 SHLQ $0x10, R10 - IMULQ R8, R10 + IMULQ R9, R10 SHRQ $0x32, R10 - MOVL 24(SP)(R9*4), BP - MOVL 24(SP)(R10*4), DI - MOVL CX, 24(SP)(R9*4) - LEAL 1(CX), R9 - MOVL R9, 24(SP)(R10*4) - MOVQ SI, R9 - SHRQ $0x10, R9 - SHLQ $0x10, R9 - IMULQ R8, R9 - SHRQ $0x32, R9 - MOVL CX, R8 - SUBL 16(SP), R8 - MOVL 1(DX)(R8*1), R10 - MOVQ SI, R8 - SHRQ $0x08, R8 - CMPL R8, R10 + SHLQ $0x10, R11 + IMULQ R9, R11 + SHRQ $0x32, R11 + MOVL 24(SP)(R10*4), SI + MOVL 24(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + LEAL 1(CX), R10 + MOVL R10, 24(SP)(R11*4) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + MOVL CX, R9 + SUBL 16(SP), R9 + MOVL 1(DX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 JNE no_repeat_found_encodeSnappyBlockAsm - LEAL 1(CX), SI - MOVL 12(SP), BP - MOVL SI, DI - SUBL 16(SP), DI + LEAL 1(CX), DI + MOVL 12(SP), SI + MOVL DI, R8 + SUBL 16(SP), R8 JZ repeat_extend_back_end_encodeSnappyBlockAsm repeat_extend_back_loop_encodeSnappyBlockAsm: - CMPL SI, BP + CMPL DI, SI JLE repeat_extend_back_end_encodeSnappyBlockAsm - MOVB -1(DX)(DI*1), BL - MOVB -1(DX)(SI*1), R8 - CMPB BL, R8 + MOVB -1(DX)(R8*1), BL + MOVB -1(DX)(DI*1), R9 + CMPB BL, R9 JNE repeat_extend_back_end_encodeSnappyBlockAsm - LEAL -1(SI), SI - DECL DI + LEAL -1(DI), DI + DECL R8 JNZ repeat_extend_back_loop_encodeSnappyBlockAsm repeat_extend_back_end_encodeSnappyBlockAsm: - MOVL 12(SP), BP - CMPL BP, SI + MOVL 12(SP), SI + CMPL SI, DI JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm - MOVL SI, DI - MOVL SI, 12(SP) - LEAQ (DX)(BP*1), R8 - SUBL BP, DI - LEAL -1(DI), BP - CMPL BP, $0x3c + MOVL DI, R8 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R9 + SUBL SI, R8 + LEAL -1(R8), SI + CMPL SI, $0x3c JLT one_byte_repeat_emit_encodeSnappyBlockAsm - CMPL BP, $0x00000100 + CMPL SI, $0x00000100 JLT two_bytes_repeat_emit_encodeSnappyBlockAsm - CMPL BP, $0x00010000 + CMPL SI, $0x00010000 JLT three_bytes_repeat_emit_encodeSnappyBlockAsm - CMPL BP, $0x01000000 + CMPL SI, $0x01000000 JLT four_bytes_repeat_emit_encodeSnappyBlockAsm MOVB $0xfc, (AX) - MOVL BP, 1(AX) + MOVL SI, 1(AX) ADDQ $0x05, AX JMP memmove_long_repeat_emit_encodeSnappyBlockAsm four_bytes_repeat_emit_encodeSnappyBlockAsm: - MOVL BP, R9 - SHRL $0x10, R9 + MOVL SI, R10 + SHRL $0x10, R10 MOVB $0xf8, (AX) - MOVW BP, 1(AX) - MOVB R9, 3(AX) + MOVW SI, 1(AX) + MOVB R10, 3(AX) ADDQ $0x04, AX JMP memmove_long_repeat_emit_encodeSnappyBlockAsm three_bytes_repeat_emit_encodeSnappyBlockAsm: MOVB $0xf4, (AX) - MOVW BP, 1(AX) + MOVW SI, 1(AX) ADDQ $0x03, AX JMP memmove_long_repeat_emit_encodeSnappyBlockAsm two_bytes_repeat_emit_encodeSnappyBlockAsm: MOVB $0xf0, (AX) - MOVB BP, 1(AX) + MOVB SI, 1(AX) ADDQ $0x02, AX - CMPL BP, $0x40 + CMPL SI, $0x40 JL memmove_repeat_emit_encodeSnappyBlockAsm JMP memmove_long_repeat_emit_encodeSnappyBlockAsm one_byte_repeat_emit_encodeSnappyBlockAsm: - SHLB $0x02, BP - MOVB BP, (AX) + SHLB $0x02, SI + MOVB SI, (AX) ADDQ $0x01, AX memmove_repeat_emit_encodeSnappyBlockAsm: - LEAQ (AX)(DI*1), BP + LEAQ (AX)(R8*1), SI // genMemMoveShort - CMPQ DI, $0x03 - JB emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_1or2 - JE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_3 - CMPQ DI, $0x08 - JB emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_4through7 - CMPQ DI, $0x10 + CMPQ R8, $0x08 + JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8 + CMPQ R8, $0x10 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16 - CMPQ DI, $0x20 + CMPQ R8, $0x20 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64 -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_1or2: - MOVB (R8), R9 - MOVB -1(R8)(DI*1), R8 - MOVB R9, (AX) - MOVB R8, -1(AX)(DI*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_3: - MOVW (R8), R9 - MOVB 2(R8), R8 - MOVW R9, (AX) - MOVB R8, 2(AX) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_4through7: - MOVL (R8), R9 - MOVL -4(R8)(DI*1), R8 - MOVL R9, (AX) - MOVL R8, -4(AX)(DI*1) +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8: + MOVQ (R9), R10 + MOVQ R10, (AX) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16: - MOVQ (R8), R9 - MOVQ -8(R8)(DI*1), R8 - MOVQ R9, (AX) - MOVQ R8, -8(AX)(DI*1) + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32: - MOVOU (R8), X0 - MOVOU -16(R8)(DI*1), X1 + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(DI*1) + MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64: - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(DI*1), X2 - MOVOU -16(R8)(DI*1), X3 + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(DI*1) - MOVOU X3, -16(AX)(DI*1) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) memmove_end_copy_repeat_emit_encodeSnappyBlockAsm: - MOVQ BP, AX + MOVQ SI, AX JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm memmove_long_repeat_emit_encodeSnappyBlockAsm: - LEAQ (AX)(DI*1), BP + LEAQ (AX)(R8*1), SI // genMemMoveLong - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(DI*1), X2 - MOVOU -16(R8)(DI*1), X3 - MOVQ DI, R10 - SHRQ $0x05, R10 - MOVQ AX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R11 - SUBQ R9, R11 - DECQ R10 + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(R8)(R11*1), R9 - LEAQ -32(AX)(R11*1), R12 + LEAQ -32(R9)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back: - MOVOU (R9), X4 - MOVOU 16(R9), X5 - MOVOA X4, (R12) - MOVOA X5, 16(R12) + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 ADDQ $0x20, R12 - ADDQ $0x20, R9 - ADDQ $0x20, R11 - DECQ R10 + DECQ R11 JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(R8)(R11*1), X4 - MOVOU -16(R8)(R11*1), X5 - MOVOA X4, -32(AX)(R11*1) - MOVOA X5, -16(AX)(R11*1) - ADDQ $0x20, R11 - CMPQ DI, R11 + MOVOU -32(R9)(R12*1), X4 + MOVOU -16(R9)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R8, R12 JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(DI*1) - MOVOU X3, -16(AX)(DI*1) - MOVQ BP, AX + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ SI, AX emit_literal_done_repeat_emit_encodeSnappyBlockAsm: ADDL $0x05, CX - MOVL CX, BP - SUBL 16(SP), BP - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(BP*1), BP + MOVL CX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), SI // matchLen - XORL R10, R10 - CMPL DI, $0x08 + XORL R11, R11 + CMPL R8, $0x08 JL matchlen_single_repeat_extend_encodeSnappyBlockAsm matchlen_loopback_repeat_extend_encodeSnappyBlockAsm: - MOVQ (R8)(R10*1), R9 - XORQ (BP)(R10*1), R9 - TESTQ R9, R9 + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + TESTQ R10, R10 JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 + BSFQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 JMP repeat_extend_forward_end_encodeSnappyBlockAsm matchlen_loop_repeat_extend_encodeSnappyBlockAsm: - LEAL -8(DI), DI - LEAL 8(R10), R10 - CMPL DI, $0x08 + LEAL -8(R8), R8 + LEAL 8(R11), R11 + CMPL R8, $0x08 JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm matchlen_single_repeat_extend_encodeSnappyBlockAsm: - TESTL DI, DI + TESTL R8, R8 JZ repeat_extend_forward_end_encodeSnappyBlockAsm matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm: - MOVB (R8)(R10*1), R9 - CMPB (BP)(R10*1), R9 + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 JNE repeat_extend_forward_end_encodeSnappyBlockAsm - LEAL 1(R10), R10 - DECL DI + LEAL 1(R11), R11 + DECL R8 JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm repeat_extend_forward_end_encodeSnappyBlockAsm: - ADDL R10, CX - MOVL CX, BP - SUBL SI, BP - MOVL 16(SP), SI + ADDL R11, CX + MOVL CX, SI + SUBL DI, SI + MOVL 16(SP), DI // emitCopy - CMPL SI, $0x00010000 + CMPL DI, $0x00010000 JL two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm: - CMPL BP, $0x40 + CMPL SI, $0x40 JLE four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm MOVB $0xff, (AX) - MOVL SI, 1(AX) - LEAL -64(BP), BP + MOVL DI, 1(AX) + LEAL -64(SI), SI ADDQ $0x05, AX - CMPL BP, $0x04 + CMPL SI, $0x04 JL four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm JMP four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm: - TESTL BP, BP + TESTL SI, SI JZ repeat_end_emit_encodeSnappyBlockAsm MOVB $0x03, BL - LEAL -4(BX)(BP*4), BP - MOVB BP, (AX) - MOVL SI, 1(AX) + LEAL -4(BX)(SI*4), SI + MOVB SI, (AX) + MOVL DI, 1(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeSnappyBlockAsm two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm: - CMPL BP, $0x40 + CMPL SI, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm MOVB $0xee, (AX) - MOVW SI, 1(AX) - LEAL -60(BP), BP + MOVW DI, 1(AX) + LEAL -60(SI), SI ADDQ $0x03, AX JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm: - CMPL BP, $0x0c + CMPL SI, $0x0c JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm - CMPL SI, $0x00000800 + CMPL DI, $0x00000800 JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm MOVB $0x01, BL - LEAL -16(BX)(BP*4), BP - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, BP - MOVB BP, (AX) + LEAL -16(BX)(SI*4), SI + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeSnappyBlockAsm emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm: MOVB $0x02, BL - LEAL -4(BX)(BP*4), BP - MOVB BP, (AX) - MOVW SI, 1(AX) + LEAL -4(BX)(SI*4), SI + MOVB SI, (AX) + MOVW DI, 1(AX) ADDQ $0x03, AX repeat_end_emit_encodeSnappyBlockAsm: @@ -9529,16 +9176,16 @@ repeat_end_emit_encodeSnappyBlockAsm: JMP search_loop_encodeSnappyBlockAsm no_repeat_found_encodeSnappyBlockAsm: - CMPL (DX)(BP*1), SI + CMPL (DX)(SI*1), DI JEQ candidate_match_encodeSnappyBlockAsm - SHRQ $0x08, SI - MOVL 24(SP)(R9*4), BP - LEAL 2(CX), R8 - CMPL (DX)(DI*1), SI + SHRQ $0x08, DI + MOVL 24(SP)(R10*4), SI + LEAL 2(CX), R9 + CMPL (DX)(R8*1), DI JEQ candidate2_match_encodeSnappyBlockAsm - MOVL R8, 24(SP)(R9*4) - SHRQ $0x08, SI - CMPL (DX)(BP*1), SI + MOVL R9, 24(SP)(R10*4) + SHRQ $0x08, DI + CMPL (DX)(SI*1), DI JEQ candidate3_match_encodeSnappyBlockAsm MOVL 20(SP), CX JMP search_loop_encodeSnappyBlockAsm @@ -9548,328 +9195,309 @@ candidate3_match_encodeSnappyBlockAsm: JMP candidate_match_encodeSnappyBlockAsm candidate2_match_encodeSnappyBlockAsm: - MOVL R8, 24(SP)(R9*4) + MOVL R9, 24(SP)(R10*4) INCL CX - MOVL DI, BP + MOVL R8, SI candidate_match_encodeSnappyBlockAsm: - MOVL 12(SP), SI - TESTL BP, BP + MOVL 12(SP), DI + TESTL SI, SI JZ match_extend_back_end_encodeSnappyBlockAsm match_extend_back_loop_encodeSnappyBlockAsm: - CMPL CX, SI + CMPL CX, DI JLE match_extend_back_end_encodeSnappyBlockAsm - MOVB -1(DX)(BP*1), BL - MOVB -1(DX)(CX*1), DI - CMPB BL, DI + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 JNE match_extend_back_end_encodeSnappyBlockAsm LEAL -1(CX), CX - DECL BP + DECL SI JZ match_extend_back_end_encodeSnappyBlockAsm JMP match_extend_back_loop_encodeSnappyBlockAsm match_extend_back_end_encodeSnappyBlockAsm: - MOVL CX, SI - SUBL 12(SP), SI - LEAQ 5(AX)(SI*1), SI - CMPQ SI, (SP) + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 5(AX)(DI*1), DI + CMPQ DI, (SP) JL match_dst_size_check_encodeSnappyBlockAsm MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeSnappyBlockAsm: - MOVL CX, SI - MOVL 12(SP), DI - CMPL DI, SI + MOVL CX, DI + MOVL 12(SP), R8 + CMPL R8, DI JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(DI*1), SI - SUBL DI, R8 - LEAL -1(R8), DI - CMPL DI, $0x3c + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(R8*1), DI + SUBL R8, R9 + LEAL -1(R9), R8 + CMPL R8, $0x3c JLT one_byte_match_emit_encodeSnappyBlockAsm - CMPL DI, $0x00000100 + CMPL R8, $0x00000100 JLT two_bytes_match_emit_encodeSnappyBlockAsm - CMPL DI, $0x00010000 + CMPL R8, $0x00010000 JLT three_bytes_match_emit_encodeSnappyBlockAsm - CMPL DI, $0x01000000 + CMPL R8, $0x01000000 JLT four_bytes_match_emit_encodeSnappyBlockAsm MOVB $0xfc, (AX) - MOVL DI, 1(AX) + MOVL R8, 1(AX) ADDQ $0x05, AX JMP memmove_long_match_emit_encodeSnappyBlockAsm four_bytes_match_emit_encodeSnappyBlockAsm: - MOVL DI, R9 - SHRL $0x10, R9 + MOVL R8, R10 + SHRL $0x10, R10 MOVB $0xf8, (AX) - MOVW DI, 1(AX) - MOVB R9, 3(AX) + MOVW R8, 1(AX) + MOVB R10, 3(AX) ADDQ $0x04, AX JMP memmove_long_match_emit_encodeSnappyBlockAsm three_bytes_match_emit_encodeSnappyBlockAsm: MOVB $0xf4, (AX) - MOVW DI, 1(AX) + MOVW R8, 1(AX) ADDQ $0x03, AX JMP memmove_long_match_emit_encodeSnappyBlockAsm two_bytes_match_emit_encodeSnappyBlockAsm: MOVB $0xf0, (AX) - MOVB DI, 1(AX) + MOVB R8, 1(AX) ADDQ $0x02, AX - CMPL DI, $0x40 + CMPL R8, $0x40 JL memmove_match_emit_encodeSnappyBlockAsm JMP memmove_long_match_emit_encodeSnappyBlockAsm one_byte_match_emit_encodeSnappyBlockAsm: - SHLB $0x02, DI - MOVB DI, (AX) + SHLB $0x02, R8 + MOVB R8, (AX) ADDQ $0x01, AX memmove_match_emit_encodeSnappyBlockAsm: - LEAQ (AX)(R8*1), DI + LEAQ (AX)(R9*1), R8 // genMemMoveShort - CMPQ R8, $0x03 - JB emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_1or2 - JE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_3 - CMPQ R8, $0x08 - JB emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_4through7 - CMPQ R8, $0x10 + CMPQ R9, $0x08 + JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8 + CMPQ R9, $0x10 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16 - CMPQ R8, $0x20 + CMPQ R9, $0x20 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64 -emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_1or2: - MOVB (SI), R9 - MOVB -1(SI)(R8*1), SI - MOVB R9, (AX) - MOVB SI, -1(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_3: - MOVW (SI), R9 - MOVB 2(SI), SI - MOVW R9, (AX) - MOVB SI, 2(AX) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_4through7: - MOVL (SI), R9 - MOVL -4(SI)(R8*1), SI - MOVL R9, (AX) - MOVL SI, -4(AX)(R8*1) +emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8: + MOVQ (DI), R10 + MOVQ R10, (AX) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16: - MOVQ (SI), R9 - MOVQ -8(SI)(R8*1), SI - MOVQ R9, (AX) - MOVQ SI, -8(AX)(R8*1) + MOVQ (DI), R10 + MOVQ -8(DI)(R9*1), DI + MOVQ R10, (AX) + MOVQ DI, -8(AX)(R9*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32: - MOVOU (SI), X0 - MOVOU -16(SI)(R8*1), X1 + MOVOU (DI), X0 + MOVOU -16(DI)(R9*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) + MOVOU X1, -16(AX)(R9*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64: - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) memmove_end_copy_match_emit_encodeSnappyBlockAsm: - MOVQ DI, AX + MOVQ R8, AX JMP emit_literal_done_match_emit_encodeSnappyBlockAsm memmove_long_match_emit_encodeSnappyBlockAsm: - LEAQ (AX)(R8*1), DI + LEAQ (AX)(R9*1), R8 // genMemMoveLong - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 - MOVQ R8, R10 - SHRQ $0x05, R10 - MOVQ AX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R11 - SUBQ R9, R11 - DECQ R10 + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVQ R9, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(SI)(R11*1), R9 - LEAQ -32(AX)(R11*1), R12 + LEAQ -32(DI)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back: - MOVOU (R9), X4 - MOVOU 16(R9), X5 - MOVOA X4, (R12) - MOVOA X5, 16(R12) + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 ADDQ $0x20, R12 - ADDQ $0x20, R9 - ADDQ $0x20, R11 - DECQ R10 + DECQ R11 JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(SI)(R11*1), X4 - MOVOU -16(SI)(R11*1), X5 - MOVOA X4, -32(AX)(R11*1) - MOVOA X5, -16(AX)(R11*1) - ADDQ $0x20, R11 - CMPQ R8, R11 + MOVOU -32(DI)(R12*1), X4 + MOVOU -16(DI)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R9, R12 JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ DI, AX + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ R8, AX emit_literal_done_match_emit_encodeSnappyBlockAsm: match_nolit_loop_encodeSnappyBlockAsm: - MOVL CX, SI - SUBL BP, SI - MOVL SI, 16(SP) + MOVL CX, DI + SUBL SI, DI + MOVL DI, 16(SP) ADDL $0x04, CX - ADDL $0x04, BP - MOVQ src_len+32(FP), SI - SUBL CX, SI - LEAQ (DX)(CX*1), DI - LEAQ (DX)(BP*1), BP + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(SI*1), SI // matchLen - XORL R9, R9 - CMPL SI, $0x08 + XORL R10, R10 + CMPL DI, $0x08 JL matchlen_single_match_nolit_encodeSnappyBlockAsm matchlen_loopback_match_nolit_encodeSnappyBlockAsm: - MOVQ (DI)(R9*1), R8 - XORQ (BP)(R9*1), R8 - TESTQ R8, R8 + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + TESTQ R9, R9 JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm - BSFQ R8, R8 - SARQ $0x03, R8 - LEAL (R9)(R8*1), R9 + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 JMP match_nolit_end_encodeSnappyBlockAsm matchlen_loop_match_nolit_encodeSnappyBlockAsm: - LEAL -8(SI), SI - LEAL 8(R9), R9 - CMPL SI, $0x08 + LEAL -8(DI), DI + LEAL 8(R10), R10 + CMPL DI, $0x08 JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm matchlen_single_match_nolit_encodeSnappyBlockAsm: - TESTL SI, SI + TESTL DI, DI JZ match_nolit_end_encodeSnappyBlockAsm matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm: - MOVB (DI)(R9*1), R8 - CMPB (BP)(R9*1), R8 + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 JNE match_nolit_end_encodeSnappyBlockAsm - LEAL 1(R9), R9 - DECL SI + LEAL 1(R10), R10 + DECL DI JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm match_nolit_end_encodeSnappyBlockAsm: - ADDL R9, CX - MOVL 16(SP), BP - ADDL $0x04, R9 + ADDL R10, CX + MOVL 16(SP), SI + ADDL $0x04, R10 MOVL CX, 12(SP) // emitCopy - CMPL BP, $0x00010000 + CMPL SI, $0x00010000 JL two_byte_offset_match_nolit_encodeSnappyBlockAsm four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm: - CMPL R9, $0x40 + CMPL R10, $0x40 JLE four_bytes_remain_match_nolit_encodeSnappyBlockAsm MOVB $0xff, (AX) - MOVL BP, 1(AX) - LEAL -64(R9), R9 + MOVL SI, 1(AX) + LEAL -64(R10), R10 ADDQ $0x05, AX - CMPL R9, $0x04 + CMPL R10, $0x04 JL four_bytes_remain_match_nolit_encodeSnappyBlockAsm JMP four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm four_bytes_remain_match_nolit_encodeSnappyBlockAsm: - TESTL R9, R9 + TESTL R10, R10 JZ match_nolit_emitcopy_end_encodeSnappyBlockAsm MOVB $0x03, BL - LEAL -4(BX)(R9*4), R9 - MOVB R9, (AX) - MOVL BP, 1(AX) + LEAL -4(BX)(R10*4), R10 + MOVB R10, (AX) + MOVL SI, 1(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm two_byte_offset_match_nolit_encodeSnappyBlockAsm: - CMPL R9, $0x40 + CMPL R10, $0x40 JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm MOVB $0xee, (AX) - MOVW BP, 1(AX) - LEAL -60(R9), R9 + MOVW SI, 1(AX) + LEAL -60(R10), R10 ADDQ $0x03, AX JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm two_byte_offset_short_match_nolit_encodeSnappyBlockAsm: - CMPL R9, $0x0c + CMPL R10, $0x0c JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm MOVB $0x01, BL - LEAL -16(BX)(R9*4), R9 - MOVB BP, 1(AX) - SHRL $0x08, BP - SHLL $0x05, BP - ORL BP, R9 - MOVB R9, (AX) + LEAL -16(BX)(R10*4), R10 + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm emit_copy_three_match_nolit_encodeSnappyBlockAsm: MOVB $0x02, BL - LEAL -4(BX)(R9*4), R9 - MOVB R9, (AX) - MOVW BP, 1(AX) + LEAL -4(BX)(R10*4), R10 + MOVB R10, (AX) + MOVW SI, 1(AX) ADDQ $0x03, AX match_nolit_emitcopy_end_encodeSnappyBlockAsm: CMPL CX, 8(SP) JGE emit_remainder_encodeSnappyBlockAsm - MOVQ -2(DX)(CX*1), SI + MOVQ -2(DX)(CX*1), DI CMPQ AX, (SP) JL match_nolit_dst_ok_encodeSnappyBlockAsm MOVQ $0x00000000, ret+48(FP) RET match_nolit_dst_ok_encodeSnappyBlockAsm: - MOVQ $0x0000cf1bbcdcbf9b, R8 - MOVQ SI, DI - SHRQ $0x10, SI - MOVQ SI, BP - SHLQ $0x10, DI - IMULQ R8, DI - SHRQ $0x32, DI - SHLQ $0x10, BP - IMULQ R8, BP - SHRQ $0x32, BP - LEAL -2(CX), R8 - LEAQ 24(SP)(BP*4), R9 - MOVL (R9), BP - MOVL R8, 24(SP)(DI*4) - MOVL CX, (R9) - CMPL (DX)(BP*1), SI + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, SI + SHLQ $0x10, R8 + IMULQ R9, R8 + SHRQ $0x32, R8 + SHLQ $0x10, SI + IMULQ R9, SI + SHRQ $0x32, SI + LEAL -2(CX), R9 + LEAQ 24(SP)(SI*4), R10 + MOVL (R10), SI + MOVL R9, 24(SP)(R8*4) + MOVL CX, (R10) + CMPL (DX)(SI*1), DI JEQ match_nolit_loop_encodeSnappyBlockAsm INCL CX JMP search_loop_encodeSnappyBlockAsm @@ -9888,11 +9516,11 @@ emit_remainder_ok_encodeSnappyBlockAsm: MOVL 12(SP), BX CMPL BX, CX JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm - MOVL CX, BP + MOVL CX, SI MOVL CX, 12(SP) LEAQ (DX)(BX*1), CX - SUBL BX, BP - LEAL -1(BP), DX + SUBL BX, SI + LEAL -1(SI), DX CMPL DX, $0x3c JLT one_byte_emit_remainder_encodeSnappyBlockAsm CMPL DX, $0x00000100 @@ -9935,46 +9563,27 @@ one_byte_emit_remainder_encodeSnappyBlockAsm: ADDQ $0x01, AX memmove_emit_remainder_encodeSnappyBlockAsm: - LEAQ (AX)(BP*1), DX - MOVL BP, BX + LEAQ (AX)(SI*1), DX + MOVL SI, BX // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3 CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7 + JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2: - MOVB (CX), BP - MOVB -1(CX)(BX*1), CL - MOVB BP, (AX) - MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3: - MOVW (CX), BP - MOVB 2(CX), CL - MOVW BP, (AX) - MOVB CL, 2(AX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7: - MOVL (CX), BP - MOVL -4(CX)(BX*1), CX - MOVL BP, (AX) - MOVL CX, -4(AX)(BX*1) +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8: + MOVQ (CX), SI + MOVQ SI, (AX) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16: - MOVQ (CX), BP + MOVQ (CX), SI MOVQ -8(CX)(BX*1), CX - MOVQ BP, (AX) + MOVQ SI, (AX) MOVQ CX, -8(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm @@ -10000,43 +9609,43 @@ memmove_end_copy_emit_remainder_encodeSnappyBlockAsm: JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm memmove_long_emit_remainder_encodeSnappyBlockAsm: - LEAQ (AX)(BP*1), DX - MOVL BP, BX + LEAQ (AX)(SI*1), DX + MOVL SI, BX // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 - MOVQ BX, SI - SHRQ $0x05, SI - MOVQ AX, BP - ANDL $0x0000001f, BP - MOVQ $0x00000040, DI - SUBQ BP, DI - DECQ SI + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(CX)(DI*1), BP - LEAQ -32(AX)(DI*1), R8 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back: - MOVOU (BP), X4 - MOVOU 16(BP), X5 - MOVOA X4, (R8) - MOVOA X5, 16(R8) + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI ADDQ $0x20, R8 - ADDQ $0x20, BP - ADDQ $0x20, DI - DECQ SI + DECQ DI JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(CX)(DI*1), X4 - MOVOU -16(CX)(DI*1), X5 - MOVOA X4, -32(AX)(DI*1) - MOVOA X5, -16(AX)(DI*1) - ADDQ $0x20, DI - CMPQ BX, DI + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) @@ -10050,6 +9659,718 @@ emit_literal_done_emit_remainder_encodeSnappyBlockAsm: MOVQ AX, ret+48(FP) RET +// func encodeSnappyBlockAsm64K(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeSnappyBlockAsm64K(SB), $65560-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000200, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeSnappyBlockAsm64K: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeSnappyBlockAsm64K + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL CX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeSnappyBlockAsm64K: + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x06, SI + LEAL 4(CX)(SI*1), SI + CMPL SI, 8(SP) + JGE emit_remainder_encodeSnappyBlockAsm64K + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + SHLQ $0x10, R11 + IMULQ R9, R11 + SHRQ $0x32, R11 + MOVL 24(SP)(R10*4), SI + MOVL 24(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + LEAL 1(CX), R10 + MOVL R10, 24(SP)(R11*4) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + MOVL CX, R9 + SUBL 16(SP), R9 + MOVL 1(DX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 + JNE no_repeat_found_encodeSnappyBlockAsm64K + LEAL 1(CX), DI + MOVL 12(SP), SI + MOVL DI, R8 + SUBL 16(SP), R8 + JZ repeat_extend_back_end_encodeSnappyBlockAsm64K + +repeat_extend_back_loop_encodeSnappyBlockAsm64K: + CMPL DI, SI + JLE repeat_extend_back_end_encodeSnappyBlockAsm64K + MOVB -1(DX)(R8*1), BL + MOVB -1(DX)(DI*1), R9 + CMPB BL, R9 + JNE repeat_extend_back_end_encodeSnappyBlockAsm64K + LEAL -1(DI), DI + DECL R8 + JNZ repeat_extend_back_loop_encodeSnappyBlockAsm64K + +repeat_extend_back_end_encodeSnappyBlockAsm64K: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K + MOVL DI, R8 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R9 + SUBL SI, R8 + LEAL -1(R8), SI + CMPL SI, $0x3c + JLT one_byte_repeat_emit_encodeSnappyBlockAsm64K + CMPL SI, $0x00000100 + JLT two_bytes_repeat_emit_encodeSnappyBlockAsm64K + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K + +two_bytes_repeat_emit_encodeSnappyBlockAsm64K: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_repeat_emit_encodeSnappyBlockAsm64K + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K + +one_byte_repeat_emit_encodeSnappyBlockAsm64K: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeSnappyBlockAsm64K: + LEAQ (AX)(R8*1), SI + + // genMemMoveShort + CMPQ R8, $0x08 + JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8 + CMPQ R8, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8: + MOVQ (R9), R10 + MOVQ R10, (AX) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K: + MOVQ SI, AX + JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K + +memmove_long_repeat_emit_encodeSnappyBlockAsm64K: + LEAQ (AX)(R8*1), SI + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 + LEAQ -32(R9)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32: + MOVOU -32(R9)(R12*1), X4 + MOVOU -16(R9)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R8, R12 + JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ SI, AX + +emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K: + ADDL $0x05, CX + MOVL CX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R11, R11 + CMPL R8, $0x08 + JL matchlen_single_repeat_extend_encodeSnappyBlockAsm64K + +matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K: + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + TESTQ R10, R10 + JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm64K + BSFQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm64K + +matchlen_loop_repeat_extend_encodeSnappyBlockAsm64K: + LEAL -8(R8), R8 + LEAL 8(R11), R11 + CMPL R8, $0x08 + JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K + +matchlen_single_repeat_extend_encodeSnappyBlockAsm64K: + TESTL R8, R8 + JZ repeat_extend_forward_end_encodeSnappyBlockAsm64K + +matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm64K: + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE repeat_extend_forward_end_encodeSnappyBlockAsm64K + LEAL 1(R11), R11 + DECL R8 + JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm64K + +repeat_extend_forward_end_encodeSnappyBlockAsm64K: + ADDL R11, CX + MOVL CX, SI + SUBL DI, SI + MOVL 16(SP), DI + + // emitCopy +two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K: + CMPL SI, $0x40 + JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K + MOVB $0xee, (AX) + MOVW DI, 1(AX) + LEAL -60(SI), SI + ADDQ $0x03, AX + JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K + +two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K: + CMPL SI, $0x0c + JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K + CMPL DI, $0x00000800 + JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K + MOVB $0x01, BL + LEAL -16(BX)(SI*4), SI + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeSnappyBlockAsm64K + +emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K: + MOVB $0x02, BL + LEAL -4(BX)(SI*4), SI + MOVB SI, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + +repeat_end_emit_encodeSnappyBlockAsm64K: + MOVL CX, 12(SP) + JMP search_loop_encodeSnappyBlockAsm64K + +no_repeat_found_encodeSnappyBlockAsm64K: + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeSnappyBlockAsm64K + SHRQ $0x08, DI + MOVL 24(SP)(R10*4), SI + LEAL 2(CX), R9 + CMPL (DX)(R8*1), DI + JEQ candidate2_match_encodeSnappyBlockAsm64K + MOVL R9, 24(SP)(R10*4) + SHRQ $0x08, DI + CMPL (DX)(SI*1), DI + JEQ candidate3_match_encodeSnappyBlockAsm64K + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBlockAsm64K + +candidate3_match_encodeSnappyBlockAsm64K: + ADDL $0x02, CX + JMP candidate_match_encodeSnappyBlockAsm64K + +candidate2_match_encodeSnappyBlockAsm64K: + MOVL R9, 24(SP)(R10*4) + INCL CX + MOVL R8, SI + +candidate_match_encodeSnappyBlockAsm64K: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeSnappyBlockAsm64K + +match_extend_back_loop_encodeSnappyBlockAsm64K: + CMPL CX, DI + JLE match_extend_back_end_encodeSnappyBlockAsm64K + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 + JNE match_extend_back_end_encodeSnappyBlockAsm64K + LEAL -1(CX), CX + DECL SI + JZ match_extend_back_end_encodeSnappyBlockAsm64K + JMP match_extend_back_loop_encodeSnappyBlockAsm64K + +match_extend_back_end_encodeSnappyBlockAsm64K: + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 3(AX)(DI*1), DI + CMPQ DI, (SP) + JL match_dst_size_check_encodeSnappyBlockAsm64K + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeSnappyBlockAsm64K: + MOVL CX, DI + MOVL 12(SP), R8 + CMPL R8, DI + JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm64K + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(R8*1), DI + SUBL R8, R9 + LEAL -1(R9), R8 + CMPL R8, $0x3c + JLT one_byte_match_emit_encodeSnappyBlockAsm64K + CMPL R8, $0x00000100 + JLT two_bytes_match_emit_encodeSnappyBlockAsm64K + MOVB $0xf4, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeSnappyBlockAsm64K + +two_bytes_match_emit_encodeSnappyBlockAsm64K: + MOVB $0xf0, (AX) + MOVB R8, 1(AX) + ADDQ $0x02, AX + CMPL R8, $0x40 + JL memmove_match_emit_encodeSnappyBlockAsm64K + JMP memmove_long_match_emit_encodeSnappyBlockAsm64K + +one_byte_match_emit_encodeSnappyBlockAsm64K: + SHLB $0x02, R8 + MOVB R8, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeSnappyBlockAsm64K: + LEAQ (AX)(R9*1), R8 + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8: + MOVQ (DI), R10 + MOVQ R10, (AX) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16: + MOVQ (DI), R10 + MOVQ -8(DI)(R9*1), DI + MOVQ R10, (AX) + MOVQ DI, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_encodeSnappyBlockAsm64K: + MOVQ R8, AX + JMP emit_literal_done_match_emit_encodeSnappyBlockAsm64K + +memmove_long_match_emit_encodeSnappyBlockAsm64K: + LEAQ (AX)(R9*1), R8 + + // genMemMoveLong + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVQ R9, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 + LEAQ -32(DI)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32: + MOVOU -32(DI)(R12*1), X4 + MOVOU -16(DI)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R9, R12 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ R8, AX + +emit_literal_done_match_emit_encodeSnappyBlockAsm64K: +match_nolit_loop_encodeSnappyBlockAsm64K: + MOVL CX, DI + SUBL SI, DI + MOVL DI, 16(SP) + ADDL $0x04, CX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R10, R10 + CMPL DI, $0x08 + JL matchlen_single_match_nolit_encodeSnappyBlockAsm64K + +matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K: + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + TESTQ R9, R9 + JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm64K + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeSnappyBlockAsm64K + +matchlen_loop_match_nolit_encodeSnappyBlockAsm64K: + LEAL -8(DI), DI + LEAL 8(R10), R10 + CMPL DI, $0x08 + JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K + +matchlen_single_match_nolit_encodeSnappyBlockAsm64K: + TESTL DI, DI + JZ match_nolit_end_encodeSnappyBlockAsm64K + +matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm64K: + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 + JNE match_nolit_end_encodeSnappyBlockAsm64K + LEAL 1(R10), R10 + DECL DI + JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm64K + +match_nolit_end_encodeSnappyBlockAsm64K: + ADDL R10, CX + MOVL 16(SP), SI + ADDL $0x04, R10 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeSnappyBlockAsm64K: + CMPL R10, $0x40 + JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(R10), R10 + ADDQ $0x03, AX + JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm64K + +two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K: + CMPL R10, $0x0c + JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K + CMPL SI, $0x00000800 + JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K + MOVB $0x01, BL + LEAL -16(BX)(R10*4), R10 + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm64K + +emit_copy_three_match_nolit_encodeSnappyBlockAsm64K: + MOVB $0x02, BL + LEAL -4(BX)(R10*4), R10 + MOVB R10, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeSnappyBlockAsm64K: + CMPL CX, 8(SP) + JGE emit_remainder_encodeSnappyBlockAsm64K + MOVQ -2(DX)(CX*1), DI + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeSnappyBlockAsm64K + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeSnappyBlockAsm64K: + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, SI + SHLQ $0x10, R8 + IMULQ R9, R8 + SHRQ $0x32, R8 + SHLQ $0x10, SI + IMULQ R9, SI + SHRQ $0x32, SI + LEAL -2(CX), R9 + LEAQ 24(SP)(SI*4), R10 + MOVL (R10), SI + MOVL R9, 24(SP)(R8*4) + MOVL CX, (R10) + CMPL (DX)(SI*1), DI + JEQ match_nolit_loop_encodeSnappyBlockAsm64K + INCL CX + JMP search_loop_encodeSnappyBlockAsm64K + +emit_remainder_encodeSnappyBlockAsm64K: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeSnappyBlockAsm64K + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeSnappyBlockAsm64K: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeSnappyBlockAsm64K + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeSnappyBlockAsm64K + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm64K + +two_bytes_emit_remainder_encodeSnappyBlockAsm64K: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeSnappyBlockAsm64K + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm64K + +one_byte_emit_remainder_encodeSnappyBlockAsm64K: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeSnappyBlockAsm64K: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x08 + JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8: + MOVQ (CX), SI + MOVQ SI, (AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K + +memmove_long_emit_remainder_encodeSnappyBlockAsm64K: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + // func encodeSnappyBlockAsm12B(dst []byte, src []byte) int // Requires: SSE2 TEXT ·encodeSnappyBlockAsm12B(SB), $16408-56 @@ -10072,9 +10393,9 @@ zero_loop_encodeSnappyBlockAsm12B: JNZ zero_loop_encodeSnappyBlockAsm12B MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX - LEAQ -5(CX), DX - LEAQ -8(CX), BP - MOVL BP, 8(SP) + LEAQ -9(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) SHRQ $0x05, CX SUBL CX, DX LEAQ (AX)(DX*1), DX @@ -10084,275 +10405,256 @@ zero_loop_encodeSnappyBlockAsm12B: MOVQ src_base+24(FP), DX search_loop_encodeSnappyBlockAsm12B: - MOVQ (DX)(CX*1), SI - MOVL CX, BP - SUBL 12(SP), BP - SHRL $0x05, BP - LEAL 4(CX)(BP*1), BP - CMPL BP, 8(SP) + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x05, SI + LEAL 4(CX)(SI*1), SI + CMPL SI, 8(SP) JGE emit_remainder_encodeSnappyBlockAsm12B - MOVL BP, 20(SP) - MOVQ $0x000000cf1bbcdcbb, R8 - MOVQ SI, R9 - MOVQ SI, R10 - SHRQ $0x08, R10 - SHLQ $0x18, R9 - IMULQ R8, R9 - SHRQ $0x34, R9 + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x000000cf1bbcdcbb, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 SHLQ $0x18, R10 - IMULQ R8, R10 + IMULQ R9, R10 SHRQ $0x34, R10 - MOVL 24(SP)(R9*4), BP - MOVL 24(SP)(R10*4), DI - MOVL CX, 24(SP)(R9*4) - LEAL 1(CX), R9 - MOVL R9, 24(SP)(R10*4) - MOVQ SI, R9 - SHRQ $0x10, R9 - SHLQ $0x18, R9 - IMULQ R8, R9 - SHRQ $0x34, R9 - MOVL CX, R8 - SUBL 16(SP), R8 - MOVL 1(DX)(R8*1), R10 - MOVQ SI, R8 - SHRQ $0x08, R8 - CMPL R8, R10 + SHLQ $0x18, R11 + IMULQ R9, R11 + SHRQ $0x34, R11 + MOVL 24(SP)(R10*4), SI + MOVL 24(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + LEAL 1(CX), R10 + MOVL R10, 24(SP)(R11*4) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x18, R10 + IMULQ R9, R10 + SHRQ $0x34, R10 + MOVL CX, R9 + SUBL 16(SP), R9 + MOVL 1(DX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 JNE no_repeat_found_encodeSnappyBlockAsm12B - LEAL 1(CX), SI - MOVL 12(SP), BP - MOVL SI, DI - SUBL 16(SP), DI + LEAL 1(CX), DI + MOVL 12(SP), SI + MOVL DI, R8 + SUBL 16(SP), R8 JZ repeat_extend_back_end_encodeSnappyBlockAsm12B repeat_extend_back_loop_encodeSnappyBlockAsm12B: - CMPL SI, BP + CMPL DI, SI JLE repeat_extend_back_end_encodeSnappyBlockAsm12B - MOVB -1(DX)(DI*1), BL - MOVB -1(DX)(SI*1), R8 - CMPB BL, R8 + MOVB -1(DX)(R8*1), BL + MOVB -1(DX)(DI*1), R9 + CMPB BL, R9 JNE repeat_extend_back_end_encodeSnappyBlockAsm12B - LEAL -1(SI), SI - DECL DI + LEAL -1(DI), DI + DECL R8 JNZ repeat_extend_back_loop_encodeSnappyBlockAsm12B repeat_extend_back_end_encodeSnappyBlockAsm12B: - MOVL 12(SP), BP - CMPL BP, SI + MOVL 12(SP), SI + CMPL SI, DI JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B - MOVL SI, DI - MOVL SI, 12(SP) - LEAQ (DX)(BP*1), R8 - SUBL BP, DI - LEAL -1(DI), BP - CMPL BP, $0x3c + MOVL DI, R8 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R9 + SUBL SI, R8 + LEAL -1(R8), SI + CMPL SI, $0x3c JLT one_byte_repeat_emit_encodeSnappyBlockAsm12B - CMPL BP, $0x00000100 + CMPL SI, $0x00000100 JLT two_bytes_repeat_emit_encodeSnappyBlockAsm12B MOVB $0xf4, (AX) - MOVW BP, 1(AX) + MOVW SI, 1(AX) ADDQ $0x03, AX JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B two_bytes_repeat_emit_encodeSnappyBlockAsm12B: MOVB $0xf0, (AX) - MOVB BP, 1(AX) + MOVB SI, 1(AX) ADDQ $0x02, AX - CMPL BP, $0x40 + CMPL SI, $0x40 JL memmove_repeat_emit_encodeSnappyBlockAsm12B JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B one_byte_repeat_emit_encodeSnappyBlockAsm12B: - SHLB $0x02, BP - MOVB BP, (AX) + SHLB $0x02, SI + MOVB SI, (AX) ADDQ $0x01, AX memmove_repeat_emit_encodeSnappyBlockAsm12B: - LEAQ (AX)(DI*1), BP + LEAQ (AX)(R8*1), SI // genMemMoveShort - CMPQ DI, $0x03 - JB emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_1or2 - JE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_3 - CMPQ DI, $0x08 - JB emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_4through7 - CMPQ DI, $0x10 + CMPQ R8, $0x08 + JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8 + CMPQ R8, $0x10 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16 - CMPQ DI, $0x20 + CMPQ R8, $0x20 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32 JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64 -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_1or2: - MOVB (R8), R9 - MOVB -1(R8)(DI*1), R8 - MOVB R9, (AX) - MOVB R8, -1(AX)(DI*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_3: - MOVW (R8), R9 - MOVB 2(R8), R8 - MOVW R9, (AX) - MOVB R8, 2(AX) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_4through7: - MOVL (R8), R9 - MOVL -4(R8)(DI*1), R8 - MOVL R9, (AX) - MOVL R8, -4(AX)(DI*1) +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8: + MOVQ (R9), R10 + MOVQ R10, (AX) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16: - MOVQ (R8), R9 - MOVQ -8(R8)(DI*1), R8 - MOVQ R9, (AX) - MOVQ R8, -8(AX)(DI*1) + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32: - MOVOU (R8), X0 - MOVOU -16(R8)(DI*1), X1 + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(DI*1) + MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64: - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(DI*1), X2 - MOVOU -16(R8)(DI*1), X3 + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(DI*1) - MOVOU X3, -16(AX)(DI*1) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B: - MOVQ BP, AX + MOVQ SI, AX JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B memmove_long_repeat_emit_encodeSnappyBlockAsm12B: - LEAQ (AX)(DI*1), BP + LEAQ (AX)(R8*1), SI // genMemMoveLong - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(DI*1), X2 - MOVOU -16(R8)(DI*1), X3 - MOVQ DI, R10 - SHRQ $0x05, R10 - MOVQ AX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R11 - SUBQ R9, R11 - DECQ R10 + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(R8)(R11*1), R9 - LEAQ -32(AX)(R11*1), R12 + LEAQ -32(R9)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back: - MOVOU (R9), X4 - MOVOU 16(R9), X5 - MOVOA X4, (R12) - MOVOA X5, 16(R12) + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 ADDQ $0x20, R12 - ADDQ $0x20, R9 - ADDQ $0x20, R11 - DECQ R10 + DECQ R11 JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(R8)(R11*1), X4 - MOVOU -16(R8)(R11*1), X5 - MOVOA X4, -32(AX)(R11*1) - MOVOA X5, -16(AX)(R11*1) - ADDQ $0x20, R11 - CMPQ DI, R11 + MOVOU -32(R9)(R12*1), X4 + MOVOU -16(R9)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R8, R12 JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(DI*1) - MOVOU X3, -16(AX)(DI*1) - MOVQ BP, AX + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ SI, AX emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B: ADDL $0x05, CX - MOVL CX, BP - SUBL 16(SP), BP - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(BP*1), BP + MOVL CX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), SI // matchLen - XORL R10, R10 - CMPL DI, $0x08 + XORL R11, R11 + CMPL R8, $0x08 JL matchlen_single_repeat_extend_encodeSnappyBlockAsm12B matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B: - MOVQ (R8)(R10*1), R9 - XORQ (BP)(R10*1), R9 - TESTQ R9, R9 + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + TESTQ R10, R10 JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 + BSFQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B: - LEAL -8(DI), DI - LEAL 8(R10), R10 - CMPL DI, $0x08 + LEAL -8(R8), R8 + LEAL 8(R11), R11 + CMPL R8, $0x08 JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B matchlen_single_repeat_extend_encodeSnappyBlockAsm12B: - TESTL DI, DI + TESTL R8, R8 JZ repeat_extend_forward_end_encodeSnappyBlockAsm12B matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm12B: - MOVB (R8)(R10*1), R9 - CMPB (BP)(R10*1), R9 + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 JNE repeat_extend_forward_end_encodeSnappyBlockAsm12B - LEAL 1(R10), R10 - DECL DI + LEAL 1(R11), R11 + DECL R8 JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm12B repeat_extend_forward_end_encodeSnappyBlockAsm12B: - ADDL R10, CX - MOVL CX, BP - SUBL SI, BP - MOVL 16(SP), SI + ADDL R11, CX + MOVL CX, SI + SUBL DI, SI + MOVL 16(SP), DI // emitCopy two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B: - CMPL BP, $0x40 + CMPL SI, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B MOVB $0xee, (AX) - MOVW SI, 1(AX) - LEAL -60(BP), BP + MOVW DI, 1(AX) + LEAL -60(SI), SI ADDQ $0x03, AX JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B: - CMPL BP, $0x0c + CMPL SI, $0x0c JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B - CMPL SI, $0x00000800 + CMPL DI, $0x00000800 JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B MOVB $0x01, BL - LEAL -16(BX)(BP*4), BP - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, BP - MOVB BP, (AX) + LEAL -16(BX)(SI*4), SI + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeSnappyBlockAsm12B emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B: MOVB $0x02, BL - LEAL -4(BX)(BP*4), BP - MOVB BP, (AX) - MOVW SI, 1(AX) + LEAL -4(BX)(SI*4), SI + MOVB SI, (AX) + MOVW DI, 1(AX) ADDQ $0x03, AX repeat_end_emit_encodeSnappyBlockAsm12B: @@ -10360,16 +10662,16 @@ repeat_end_emit_encodeSnappyBlockAsm12B: JMP search_loop_encodeSnappyBlockAsm12B no_repeat_found_encodeSnappyBlockAsm12B: - CMPL (DX)(BP*1), SI + CMPL (DX)(SI*1), DI JEQ candidate_match_encodeSnappyBlockAsm12B - SHRQ $0x08, SI - MOVL 24(SP)(R9*4), BP - LEAL 2(CX), R8 - CMPL (DX)(DI*1), SI + SHRQ $0x08, DI + MOVL 24(SP)(R10*4), SI + LEAL 2(CX), R9 + CMPL (DX)(R8*1), DI JEQ candidate2_match_encodeSnappyBlockAsm12B - MOVL R8, 24(SP)(R9*4) - SHRQ $0x08, SI - CMPL (DX)(BP*1), SI + MOVL R9, 24(SP)(R10*4) + SHRQ $0x08, DI + CMPL (DX)(SI*1), DI JEQ candidate3_match_encodeSnappyBlockAsm12B MOVL 20(SP), CX JMP search_loop_encodeSnappyBlockAsm12B @@ -10379,285 +10681,266 @@ candidate3_match_encodeSnappyBlockAsm12B: JMP candidate_match_encodeSnappyBlockAsm12B candidate2_match_encodeSnappyBlockAsm12B: - MOVL R8, 24(SP)(R9*4) + MOVL R9, 24(SP)(R10*4) INCL CX - MOVL DI, BP + MOVL R8, SI candidate_match_encodeSnappyBlockAsm12B: - MOVL 12(SP), SI - TESTL BP, BP + MOVL 12(SP), DI + TESTL SI, SI JZ match_extend_back_end_encodeSnappyBlockAsm12B match_extend_back_loop_encodeSnappyBlockAsm12B: - CMPL CX, SI + CMPL CX, DI JLE match_extend_back_end_encodeSnappyBlockAsm12B - MOVB -1(DX)(BP*1), BL - MOVB -1(DX)(CX*1), DI - CMPB BL, DI + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 JNE match_extend_back_end_encodeSnappyBlockAsm12B LEAL -1(CX), CX - DECL BP + DECL SI JZ match_extend_back_end_encodeSnappyBlockAsm12B JMP match_extend_back_loop_encodeSnappyBlockAsm12B match_extend_back_end_encodeSnappyBlockAsm12B: - MOVL CX, SI - SUBL 12(SP), SI - LEAQ 3(AX)(SI*1), SI - CMPQ SI, (SP) + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 3(AX)(DI*1), DI + CMPQ DI, (SP) JL match_dst_size_check_encodeSnappyBlockAsm12B MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeSnappyBlockAsm12B: - MOVL CX, SI - MOVL 12(SP), DI - CMPL DI, SI + MOVL CX, DI + MOVL 12(SP), R8 + CMPL R8, DI JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm12B - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(DI*1), SI - SUBL DI, R8 - LEAL -1(R8), DI - CMPL DI, $0x3c + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(R8*1), DI + SUBL R8, R9 + LEAL -1(R9), R8 + CMPL R8, $0x3c JLT one_byte_match_emit_encodeSnappyBlockAsm12B - CMPL DI, $0x00000100 + CMPL R8, $0x00000100 JLT two_bytes_match_emit_encodeSnappyBlockAsm12B MOVB $0xf4, (AX) - MOVW DI, 1(AX) + MOVW R8, 1(AX) ADDQ $0x03, AX JMP memmove_long_match_emit_encodeSnappyBlockAsm12B two_bytes_match_emit_encodeSnappyBlockAsm12B: MOVB $0xf0, (AX) - MOVB DI, 1(AX) + MOVB R8, 1(AX) ADDQ $0x02, AX - CMPL DI, $0x40 + CMPL R8, $0x40 JL memmove_match_emit_encodeSnappyBlockAsm12B JMP memmove_long_match_emit_encodeSnappyBlockAsm12B one_byte_match_emit_encodeSnappyBlockAsm12B: - SHLB $0x02, DI - MOVB DI, (AX) + SHLB $0x02, R8 + MOVB R8, (AX) ADDQ $0x01, AX memmove_match_emit_encodeSnappyBlockAsm12B: - LEAQ (AX)(R8*1), DI + LEAQ (AX)(R9*1), R8 // genMemMoveShort - CMPQ R8, $0x03 - JB emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_1or2 - JE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_3 - CMPQ R8, $0x08 - JB emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_4through7 - CMPQ R8, $0x10 + CMPQ R9, $0x08 + JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8 + CMPQ R9, $0x10 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16 - CMPQ R8, $0x20 + CMPQ R9, $0x20 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64 -emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_1or2: - MOVB (SI), R9 - MOVB -1(SI)(R8*1), SI - MOVB R9, (AX) - MOVB SI, -1(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_3: - MOVW (SI), R9 - MOVB 2(SI), SI - MOVW R9, (AX) - MOVB SI, 2(AX) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_4through7: - MOVL (SI), R9 - MOVL -4(SI)(R8*1), SI - MOVL R9, (AX) - MOVL SI, -4(AX)(R8*1) +emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8: + MOVQ (DI), R10 + MOVQ R10, (AX) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16: - MOVQ (SI), R9 - MOVQ -8(SI)(R8*1), SI - MOVQ R9, (AX) - MOVQ SI, -8(AX)(R8*1) + MOVQ (DI), R10 + MOVQ -8(DI)(R9*1), DI + MOVQ R10, (AX) + MOVQ DI, -8(AX)(R9*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32: - MOVOU (SI), X0 - MOVOU -16(SI)(R8*1), X1 + MOVOU (DI), X0 + MOVOU -16(DI)(R9*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) + MOVOU X1, -16(AX)(R9*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64: - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) memmove_end_copy_match_emit_encodeSnappyBlockAsm12B: - MOVQ DI, AX + MOVQ R8, AX JMP emit_literal_done_match_emit_encodeSnappyBlockAsm12B memmove_long_match_emit_encodeSnappyBlockAsm12B: - LEAQ (AX)(R8*1), DI + LEAQ (AX)(R9*1), R8 // genMemMoveLong - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 - MOVQ R8, R10 - SHRQ $0x05, R10 - MOVQ AX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R11 - SUBQ R9, R11 - DECQ R10 + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVQ R9, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(SI)(R11*1), R9 - LEAQ -32(AX)(R11*1), R12 + LEAQ -32(DI)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back: - MOVOU (R9), X4 - MOVOU 16(R9), X5 - MOVOA X4, (R12) - MOVOA X5, 16(R12) + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 ADDQ $0x20, R12 - ADDQ $0x20, R9 - ADDQ $0x20, R11 - DECQ R10 + DECQ R11 JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(SI)(R11*1), X4 - MOVOU -16(SI)(R11*1), X5 - MOVOA X4, -32(AX)(R11*1) - MOVOA X5, -16(AX)(R11*1) - ADDQ $0x20, R11 - CMPQ R8, R11 + MOVOU -32(DI)(R12*1), X4 + MOVOU -16(DI)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R9, R12 JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ DI, AX + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ R8, AX emit_literal_done_match_emit_encodeSnappyBlockAsm12B: match_nolit_loop_encodeSnappyBlockAsm12B: - MOVL CX, SI - SUBL BP, SI - MOVL SI, 16(SP) + MOVL CX, DI + SUBL SI, DI + MOVL DI, 16(SP) ADDL $0x04, CX - ADDL $0x04, BP - MOVQ src_len+32(FP), SI - SUBL CX, SI - LEAQ (DX)(CX*1), DI - LEAQ (DX)(BP*1), BP + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(SI*1), SI // matchLen - XORL R9, R9 - CMPL SI, $0x08 + XORL R10, R10 + CMPL DI, $0x08 JL matchlen_single_match_nolit_encodeSnappyBlockAsm12B matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B: - MOVQ (DI)(R9*1), R8 - XORQ (BP)(R9*1), R8 - TESTQ R8, R8 + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + TESTQ R9, R9 JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm12B - BSFQ R8, R8 - SARQ $0x03, R8 - LEAL (R9)(R8*1), R9 + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 JMP match_nolit_end_encodeSnappyBlockAsm12B matchlen_loop_match_nolit_encodeSnappyBlockAsm12B: - LEAL -8(SI), SI - LEAL 8(R9), R9 - CMPL SI, $0x08 + LEAL -8(DI), DI + LEAL 8(R10), R10 + CMPL DI, $0x08 JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B matchlen_single_match_nolit_encodeSnappyBlockAsm12B: - TESTL SI, SI + TESTL DI, DI JZ match_nolit_end_encodeSnappyBlockAsm12B matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm12B: - MOVB (DI)(R9*1), R8 - CMPB (BP)(R9*1), R8 + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 JNE match_nolit_end_encodeSnappyBlockAsm12B - LEAL 1(R9), R9 - DECL SI + LEAL 1(R10), R10 + DECL DI JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm12B match_nolit_end_encodeSnappyBlockAsm12B: - ADDL R9, CX - MOVL 16(SP), BP - ADDL $0x04, R9 + ADDL R10, CX + MOVL 16(SP), SI + ADDL $0x04, R10 MOVL CX, 12(SP) // emitCopy two_byte_offset_match_nolit_encodeSnappyBlockAsm12B: - CMPL R9, $0x40 + CMPL R10, $0x40 JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B MOVB $0xee, (AX) - MOVW BP, 1(AX) - LEAL -60(R9), R9 + MOVW SI, 1(AX) + LEAL -60(R10), R10 ADDQ $0x03, AX JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm12B two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B: - CMPL R9, $0x0c + CMPL R10, $0x0c JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B MOVB $0x01, BL - LEAL -16(BX)(R9*4), R9 - MOVB BP, 1(AX) - SHRL $0x08, BP - SHLL $0x05, BP - ORL BP, R9 - MOVB R9, (AX) + LEAL -16(BX)(R10*4), R10 + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm12B emit_copy_three_match_nolit_encodeSnappyBlockAsm12B: MOVB $0x02, BL - LEAL -4(BX)(R9*4), R9 - MOVB R9, (AX) - MOVW BP, 1(AX) + LEAL -4(BX)(R10*4), R10 + MOVB R10, (AX) + MOVW SI, 1(AX) ADDQ $0x03, AX match_nolit_emitcopy_end_encodeSnappyBlockAsm12B: CMPL CX, 8(SP) JGE emit_remainder_encodeSnappyBlockAsm12B - MOVQ -2(DX)(CX*1), SI + MOVQ -2(DX)(CX*1), DI CMPQ AX, (SP) JL match_nolit_dst_ok_encodeSnappyBlockAsm12B MOVQ $0x00000000, ret+48(FP) RET match_nolit_dst_ok_encodeSnappyBlockAsm12B: - MOVQ $0x000000cf1bbcdcbb, R8 - MOVQ SI, DI - SHRQ $0x10, SI - MOVQ SI, BP - SHLQ $0x18, DI - IMULQ R8, DI - SHRQ $0x34, DI - SHLQ $0x18, BP - IMULQ R8, BP - SHRQ $0x34, BP - LEAL -2(CX), R8 - LEAQ 24(SP)(BP*4), R9 - MOVL (R9), BP - MOVL R8, 24(SP)(DI*4) - MOVL CX, (R9) - CMPL (DX)(BP*1), SI + MOVQ $0x000000cf1bbcdcbb, R9 + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, SI + SHLQ $0x18, R8 + IMULQ R9, R8 + SHRQ $0x34, R8 + SHLQ $0x18, SI + IMULQ R9, SI + SHRQ $0x34, SI + LEAL -2(CX), R9 + LEAQ 24(SP)(SI*4), R10 + MOVL (R10), SI + MOVL R9, 24(SP)(R8*4) + MOVL CX, (R10) + CMPL (DX)(SI*1), DI JEQ match_nolit_loop_encodeSnappyBlockAsm12B INCL CX JMP search_loop_encodeSnappyBlockAsm12B @@ -10676,11 +10959,11 @@ emit_remainder_ok_encodeSnappyBlockAsm12B: MOVL 12(SP), BX CMPL BX, CX JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B - MOVL CX, BP + MOVL CX, SI MOVL CX, 12(SP) LEAQ (DX)(BX*1), CX - SUBL BX, BP - LEAL -1(BP), DX + SUBL BX, SI + LEAL -1(SI), DX CMPL DX, $0x3c JLT one_byte_emit_remainder_encodeSnappyBlockAsm12B CMPL DX, $0x00000100 @@ -10704,46 +10987,27 @@ one_byte_emit_remainder_encodeSnappyBlockAsm12B: ADDQ $0x01, AX memmove_emit_remainder_encodeSnappyBlockAsm12B: - LEAQ (AX)(BP*1), DX - MOVL BP, BX + LEAQ (AX)(SI*1), DX + MOVL SI, BX // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3 CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7 + JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2: - MOVB (CX), BP - MOVB -1(CX)(BX*1), CL - MOVB BP, (AX) - MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3: - MOVW (CX), BP - MOVB 2(CX), CL - MOVW BP, (AX) - MOVB CL, 2(AX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7: - MOVL (CX), BP - MOVL -4(CX)(BX*1), CX - MOVL BP, (AX) - MOVL CX, -4(AX)(BX*1) +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8: + MOVQ (CX), SI + MOVQ SI, (AX) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16: - MOVQ (CX), BP + MOVQ (CX), SI MOVQ -8(CX)(BX*1), CX - MOVQ BP, (AX) + MOVQ SI, (AX) MOVQ CX, -8(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B @@ -10769,43 +11033,43 @@ memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B: JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B memmove_long_emit_remainder_encodeSnappyBlockAsm12B: - LEAQ (AX)(BP*1), DX - MOVL BP, BX + LEAQ (AX)(SI*1), DX + MOVL SI, BX // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 - MOVQ BX, SI - SHRQ $0x05, SI - MOVQ AX, BP - ANDL $0x0000001f, BP - MOVQ $0x00000040, DI - SUBQ BP, DI - DECQ SI + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(CX)(DI*1), BP - LEAQ -32(AX)(DI*1), R8 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back: - MOVOU (BP), X4 - MOVOU 16(BP), X5 - MOVOA X4, (R8) - MOVOA X5, 16(R8) + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI ADDQ $0x20, R8 - ADDQ $0x20, BP - ADDQ $0x20, DI - DECQ SI + DECQ DI JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(CX)(DI*1), X4 - MOVOU -16(CX)(DI*1), X5 - MOVOA X4, -32(AX)(DI*1) - MOVOA X5, -16(AX)(DI*1) - ADDQ $0x20, DI - CMPQ BX, DI + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) @@ -10841,9 +11105,9 @@ zero_loop_encodeSnappyBlockAsm10B: JNZ zero_loop_encodeSnappyBlockAsm10B MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX - LEAQ -5(CX), DX - LEAQ -8(CX), BP - MOVL BP, 8(SP) + LEAQ -9(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) SHRQ $0x05, CX SUBL CX, DX LEAQ (AX)(DX*1), DX @@ -10853,275 +11117,256 @@ zero_loop_encodeSnappyBlockAsm10B: MOVQ src_base+24(FP), DX search_loop_encodeSnappyBlockAsm10B: - MOVQ (DX)(CX*1), SI - MOVL CX, BP - SUBL 12(SP), BP - SHRL $0x05, BP - LEAL 4(CX)(BP*1), BP - CMPL BP, 8(SP) + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x05, SI + LEAL 4(CX)(SI*1), SI + CMPL SI, 8(SP) JGE emit_remainder_encodeSnappyBlockAsm10B - MOVL BP, 20(SP) - MOVQ $0x9e3779b1, R8 - MOVQ SI, R9 - MOVQ SI, R10 - SHRQ $0x08, R10 - SHLQ $0x20, R9 - IMULQ R8, R9 - SHRQ $0x36, R9 + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x9e3779b1, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 SHLQ $0x20, R10 - IMULQ R8, R10 + IMULQ R9, R10 SHRQ $0x36, R10 - MOVL 24(SP)(R9*4), BP - MOVL 24(SP)(R10*4), DI - MOVL CX, 24(SP)(R9*4) - LEAL 1(CX), R9 - MOVL R9, 24(SP)(R10*4) - MOVQ SI, R9 - SHRQ $0x10, R9 - SHLQ $0x20, R9 - IMULQ R8, R9 - SHRQ $0x36, R9 - MOVL CX, R8 - SUBL 16(SP), R8 - MOVL 1(DX)(R8*1), R10 - MOVQ SI, R8 - SHRQ $0x08, R8 - CMPL R8, R10 + SHLQ $0x20, R11 + IMULQ R9, R11 + SHRQ $0x36, R11 + MOVL 24(SP)(R10*4), SI + MOVL 24(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + LEAL 1(CX), R10 + MOVL R10, 24(SP)(R11*4) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x20, R10 + IMULQ R9, R10 + SHRQ $0x36, R10 + MOVL CX, R9 + SUBL 16(SP), R9 + MOVL 1(DX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 JNE no_repeat_found_encodeSnappyBlockAsm10B - LEAL 1(CX), SI - MOVL 12(SP), BP - MOVL SI, DI - SUBL 16(SP), DI + LEAL 1(CX), DI + MOVL 12(SP), SI + MOVL DI, R8 + SUBL 16(SP), R8 JZ repeat_extend_back_end_encodeSnappyBlockAsm10B repeat_extend_back_loop_encodeSnappyBlockAsm10B: - CMPL SI, BP + CMPL DI, SI JLE repeat_extend_back_end_encodeSnappyBlockAsm10B - MOVB -1(DX)(DI*1), BL - MOVB -1(DX)(SI*1), R8 - CMPB BL, R8 + MOVB -1(DX)(R8*1), BL + MOVB -1(DX)(DI*1), R9 + CMPB BL, R9 JNE repeat_extend_back_end_encodeSnappyBlockAsm10B - LEAL -1(SI), SI - DECL DI + LEAL -1(DI), DI + DECL R8 JNZ repeat_extend_back_loop_encodeSnappyBlockAsm10B repeat_extend_back_end_encodeSnappyBlockAsm10B: - MOVL 12(SP), BP - CMPL BP, SI + MOVL 12(SP), SI + CMPL SI, DI JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B - MOVL SI, DI - MOVL SI, 12(SP) - LEAQ (DX)(BP*1), R8 - SUBL BP, DI - LEAL -1(DI), BP - CMPL BP, $0x3c + MOVL DI, R8 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R9 + SUBL SI, R8 + LEAL -1(R8), SI + CMPL SI, $0x3c JLT one_byte_repeat_emit_encodeSnappyBlockAsm10B - CMPL BP, $0x00000100 + CMPL SI, $0x00000100 JLT two_bytes_repeat_emit_encodeSnappyBlockAsm10B MOVB $0xf4, (AX) - MOVW BP, 1(AX) + MOVW SI, 1(AX) ADDQ $0x03, AX JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B two_bytes_repeat_emit_encodeSnappyBlockAsm10B: MOVB $0xf0, (AX) - MOVB BP, 1(AX) + MOVB SI, 1(AX) ADDQ $0x02, AX - CMPL BP, $0x40 + CMPL SI, $0x40 JL memmove_repeat_emit_encodeSnappyBlockAsm10B JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B one_byte_repeat_emit_encodeSnappyBlockAsm10B: - SHLB $0x02, BP - MOVB BP, (AX) + SHLB $0x02, SI + MOVB SI, (AX) ADDQ $0x01, AX memmove_repeat_emit_encodeSnappyBlockAsm10B: - LEAQ (AX)(DI*1), BP + LEAQ (AX)(R8*1), SI // genMemMoveShort - CMPQ DI, $0x03 - JB emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_1or2 - JE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_3 - CMPQ DI, $0x08 - JB emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_4through7 - CMPQ DI, $0x10 + CMPQ R8, $0x08 + JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8 + CMPQ R8, $0x10 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16 - CMPQ DI, $0x20 + CMPQ R8, $0x20 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32 JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64 -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_1or2: - MOVB (R8), R9 - MOVB -1(R8)(DI*1), R8 - MOVB R9, (AX) - MOVB R8, -1(AX)(DI*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_3: - MOVW (R8), R9 - MOVB 2(R8), R8 - MOVW R9, (AX) - MOVB R8, 2(AX) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_4through7: - MOVL (R8), R9 - MOVL -4(R8)(DI*1), R8 - MOVL R9, (AX) - MOVL R8, -4(AX)(DI*1) +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8: + MOVQ (R9), R10 + MOVQ R10, (AX) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16: - MOVQ (R8), R9 - MOVQ -8(R8)(DI*1), R8 - MOVQ R9, (AX) - MOVQ R8, -8(AX)(DI*1) + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32: - MOVOU (R8), X0 - MOVOU -16(R8)(DI*1), X1 + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(DI*1) + MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64: - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(DI*1), X2 - MOVOU -16(R8)(DI*1), X3 + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(DI*1) - MOVOU X3, -16(AX)(DI*1) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B: - MOVQ BP, AX + MOVQ SI, AX JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B memmove_long_repeat_emit_encodeSnappyBlockAsm10B: - LEAQ (AX)(DI*1), BP + LEAQ (AX)(R8*1), SI // genMemMoveLong - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(DI*1), X2 - MOVOU -16(R8)(DI*1), X3 - MOVQ DI, R10 - SHRQ $0x05, R10 - MOVQ AX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R11 - SUBQ R9, R11 - DECQ R10 + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(R8)(R11*1), R9 - LEAQ -32(AX)(R11*1), R12 + LEAQ -32(R9)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back: - MOVOU (R9), X4 - MOVOU 16(R9), X5 - MOVOA X4, (R12) - MOVOA X5, 16(R12) + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 ADDQ $0x20, R12 - ADDQ $0x20, R9 - ADDQ $0x20, R11 - DECQ R10 + DECQ R11 JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(R8)(R11*1), X4 - MOVOU -16(R8)(R11*1), X5 - MOVOA X4, -32(AX)(R11*1) - MOVOA X5, -16(AX)(R11*1) - ADDQ $0x20, R11 - CMPQ DI, R11 + MOVOU -32(R9)(R12*1), X4 + MOVOU -16(R9)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R8, R12 JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(DI*1) - MOVOU X3, -16(AX)(DI*1) - MOVQ BP, AX + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ SI, AX emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B: ADDL $0x05, CX - MOVL CX, BP - SUBL 16(SP), BP - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(BP*1), BP + MOVL CX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), SI // matchLen - XORL R10, R10 - CMPL DI, $0x08 + XORL R11, R11 + CMPL R8, $0x08 JL matchlen_single_repeat_extend_encodeSnappyBlockAsm10B matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B: - MOVQ (R8)(R10*1), R9 - XORQ (BP)(R10*1), R9 - TESTQ R9, R9 + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + TESTQ R10, R10 JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 + BSFQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B: - LEAL -8(DI), DI - LEAL 8(R10), R10 - CMPL DI, $0x08 + LEAL -8(R8), R8 + LEAL 8(R11), R11 + CMPL R8, $0x08 JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B matchlen_single_repeat_extend_encodeSnappyBlockAsm10B: - TESTL DI, DI + TESTL R8, R8 JZ repeat_extend_forward_end_encodeSnappyBlockAsm10B matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm10B: - MOVB (R8)(R10*1), R9 - CMPB (BP)(R10*1), R9 + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 JNE repeat_extend_forward_end_encodeSnappyBlockAsm10B - LEAL 1(R10), R10 - DECL DI + LEAL 1(R11), R11 + DECL R8 JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm10B repeat_extend_forward_end_encodeSnappyBlockAsm10B: - ADDL R10, CX - MOVL CX, BP - SUBL SI, BP - MOVL 16(SP), SI + ADDL R11, CX + MOVL CX, SI + SUBL DI, SI + MOVL 16(SP), DI // emitCopy two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B: - CMPL BP, $0x40 + CMPL SI, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B MOVB $0xee, (AX) - MOVW SI, 1(AX) - LEAL -60(BP), BP + MOVW DI, 1(AX) + LEAL -60(SI), SI ADDQ $0x03, AX JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B: - CMPL BP, $0x0c + CMPL SI, $0x0c JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B - CMPL SI, $0x00000800 + CMPL DI, $0x00000800 JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B MOVB $0x01, BL - LEAL -16(BX)(BP*4), BP - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, BP - MOVB BP, (AX) + LEAL -16(BX)(SI*4), SI + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeSnappyBlockAsm10B emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B: MOVB $0x02, BL - LEAL -4(BX)(BP*4), BP - MOVB BP, (AX) - MOVW SI, 1(AX) + LEAL -4(BX)(SI*4), SI + MOVB SI, (AX) + MOVW DI, 1(AX) ADDQ $0x03, AX repeat_end_emit_encodeSnappyBlockAsm10B: @@ -11129,16 +11374,16 @@ repeat_end_emit_encodeSnappyBlockAsm10B: JMP search_loop_encodeSnappyBlockAsm10B no_repeat_found_encodeSnappyBlockAsm10B: - CMPL (DX)(BP*1), SI + CMPL (DX)(SI*1), DI JEQ candidate_match_encodeSnappyBlockAsm10B - SHRQ $0x08, SI - MOVL 24(SP)(R9*4), BP - LEAL 2(CX), R8 - CMPL (DX)(DI*1), SI + SHRQ $0x08, DI + MOVL 24(SP)(R10*4), SI + LEAL 2(CX), R9 + CMPL (DX)(R8*1), DI JEQ candidate2_match_encodeSnappyBlockAsm10B - MOVL R8, 24(SP)(R9*4) - SHRQ $0x08, SI - CMPL (DX)(BP*1), SI + MOVL R9, 24(SP)(R10*4) + SHRQ $0x08, DI + CMPL (DX)(SI*1), DI JEQ candidate3_match_encodeSnappyBlockAsm10B MOVL 20(SP), CX JMP search_loop_encodeSnappyBlockAsm10B @@ -11148,285 +11393,266 @@ candidate3_match_encodeSnappyBlockAsm10B: JMP candidate_match_encodeSnappyBlockAsm10B candidate2_match_encodeSnappyBlockAsm10B: - MOVL R8, 24(SP)(R9*4) + MOVL R9, 24(SP)(R10*4) INCL CX - MOVL DI, BP + MOVL R8, SI candidate_match_encodeSnappyBlockAsm10B: - MOVL 12(SP), SI - TESTL BP, BP + MOVL 12(SP), DI + TESTL SI, SI JZ match_extend_back_end_encodeSnappyBlockAsm10B match_extend_back_loop_encodeSnappyBlockAsm10B: - CMPL CX, SI + CMPL CX, DI JLE match_extend_back_end_encodeSnappyBlockAsm10B - MOVB -1(DX)(BP*1), BL - MOVB -1(DX)(CX*1), DI - CMPB BL, DI + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 JNE match_extend_back_end_encodeSnappyBlockAsm10B LEAL -1(CX), CX - DECL BP + DECL SI JZ match_extend_back_end_encodeSnappyBlockAsm10B JMP match_extend_back_loop_encodeSnappyBlockAsm10B match_extend_back_end_encodeSnappyBlockAsm10B: - MOVL CX, SI - SUBL 12(SP), SI - LEAQ 3(AX)(SI*1), SI - CMPQ SI, (SP) + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 3(AX)(DI*1), DI + CMPQ DI, (SP) JL match_dst_size_check_encodeSnappyBlockAsm10B MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeSnappyBlockAsm10B: - MOVL CX, SI - MOVL 12(SP), DI - CMPL DI, SI + MOVL CX, DI + MOVL 12(SP), R8 + CMPL R8, DI JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm10B - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(DI*1), SI - SUBL DI, R8 - LEAL -1(R8), DI - CMPL DI, $0x3c + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(R8*1), DI + SUBL R8, R9 + LEAL -1(R9), R8 + CMPL R8, $0x3c JLT one_byte_match_emit_encodeSnappyBlockAsm10B - CMPL DI, $0x00000100 + CMPL R8, $0x00000100 JLT two_bytes_match_emit_encodeSnappyBlockAsm10B MOVB $0xf4, (AX) - MOVW DI, 1(AX) + MOVW R8, 1(AX) ADDQ $0x03, AX JMP memmove_long_match_emit_encodeSnappyBlockAsm10B two_bytes_match_emit_encodeSnappyBlockAsm10B: MOVB $0xf0, (AX) - MOVB DI, 1(AX) + MOVB R8, 1(AX) ADDQ $0x02, AX - CMPL DI, $0x40 + CMPL R8, $0x40 JL memmove_match_emit_encodeSnappyBlockAsm10B JMP memmove_long_match_emit_encodeSnappyBlockAsm10B one_byte_match_emit_encodeSnappyBlockAsm10B: - SHLB $0x02, DI - MOVB DI, (AX) + SHLB $0x02, R8 + MOVB R8, (AX) ADDQ $0x01, AX memmove_match_emit_encodeSnappyBlockAsm10B: - LEAQ (AX)(R8*1), DI + LEAQ (AX)(R9*1), R8 // genMemMoveShort - CMPQ R8, $0x03 - JB emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_1or2 - JE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_3 - CMPQ R8, $0x08 - JB emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_4through7 - CMPQ R8, $0x10 + CMPQ R9, $0x08 + JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8 + CMPQ R9, $0x10 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16 - CMPQ R8, $0x20 + CMPQ R9, $0x20 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64 -emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_1or2: - MOVB (SI), R9 - MOVB -1(SI)(R8*1), SI - MOVB R9, (AX) - MOVB SI, -1(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_3: - MOVW (SI), R9 - MOVB 2(SI), SI - MOVW R9, (AX) - MOVB SI, 2(AX) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_4through7: - MOVL (SI), R9 - MOVL -4(SI)(R8*1), SI - MOVL R9, (AX) - MOVL SI, -4(AX)(R8*1) +emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8: + MOVQ (DI), R10 + MOVQ R10, (AX) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16: - MOVQ (SI), R9 - MOVQ -8(SI)(R8*1), SI - MOVQ R9, (AX) - MOVQ SI, -8(AX)(R8*1) + MOVQ (DI), R10 + MOVQ -8(DI)(R9*1), DI + MOVQ R10, (AX) + MOVQ DI, -8(AX)(R9*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32: - MOVOU (SI), X0 - MOVOU -16(SI)(R8*1), X1 + MOVOU (DI), X0 + MOVOU -16(DI)(R9*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) + MOVOU X1, -16(AX)(R9*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64: - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) memmove_end_copy_match_emit_encodeSnappyBlockAsm10B: - MOVQ DI, AX + MOVQ R8, AX JMP emit_literal_done_match_emit_encodeSnappyBlockAsm10B memmove_long_match_emit_encodeSnappyBlockAsm10B: - LEAQ (AX)(R8*1), DI + LEAQ (AX)(R9*1), R8 // genMemMoveLong - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 - MOVQ R8, R10 - SHRQ $0x05, R10 - MOVQ AX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R11 - SUBQ R9, R11 - DECQ R10 + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVQ R9, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(SI)(R11*1), R9 - LEAQ -32(AX)(R11*1), R12 + LEAQ -32(DI)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back: - MOVOU (R9), X4 - MOVOU 16(R9), X5 - MOVOA X4, (R12) - MOVOA X5, 16(R12) + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 ADDQ $0x20, R12 - ADDQ $0x20, R9 - ADDQ $0x20, R11 - DECQ R10 + DECQ R11 JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(SI)(R11*1), X4 - MOVOU -16(SI)(R11*1), X5 - MOVOA X4, -32(AX)(R11*1) - MOVOA X5, -16(AX)(R11*1) - ADDQ $0x20, R11 - CMPQ R8, R11 + MOVOU -32(DI)(R12*1), X4 + MOVOU -16(DI)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R9, R12 JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ DI, AX + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ R8, AX emit_literal_done_match_emit_encodeSnappyBlockAsm10B: match_nolit_loop_encodeSnappyBlockAsm10B: - MOVL CX, SI - SUBL BP, SI - MOVL SI, 16(SP) + MOVL CX, DI + SUBL SI, DI + MOVL DI, 16(SP) ADDL $0x04, CX - ADDL $0x04, BP - MOVQ src_len+32(FP), SI - SUBL CX, SI - LEAQ (DX)(CX*1), DI - LEAQ (DX)(BP*1), BP + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(SI*1), SI // matchLen - XORL R9, R9 - CMPL SI, $0x08 + XORL R10, R10 + CMPL DI, $0x08 JL matchlen_single_match_nolit_encodeSnappyBlockAsm10B matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B: - MOVQ (DI)(R9*1), R8 - XORQ (BP)(R9*1), R8 - TESTQ R8, R8 + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + TESTQ R9, R9 JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm10B - BSFQ R8, R8 - SARQ $0x03, R8 - LEAL (R9)(R8*1), R9 + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 JMP match_nolit_end_encodeSnappyBlockAsm10B matchlen_loop_match_nolit_encodeSnappyBlockAsm10B: - LEAL -8(SI), SI - LEAL 8(R9), R9 - CMPL SI, $0x08 + LEAL -8(DI), DI + LEAL 8(R10), R10 + CMPL DI, $0x08 JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B matchlen_single_match_nolit_encodeSnappyBlockAsm10B: - TESTL SI, SI + TESTL DI, DI JZ match_nolit_end_encodeSnappyBlockAsm10B matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm10B: - MOVB (DI)(R9*1), R8 - CMPB (BP)(R9*1), R8 + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 JNE match_nolit_end_encodeSnappyBlockAsm10B - LEAL 1(R9), R9 - DECL SI + LEAL 1(R10), R10 + DECL DI JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm10B match_nolit_end_encodeSnappyBlockAsm10B: - ADDL R9, CX - MOVL 16(SP), BP - ADDL $0x04, R9 + ADDL R10, CX + MOVL 16(SP), SI + ADDL $0x04, R10 MOVL CX, 12(SP) // emitCopy two_byte_offset_match_nolit_encodeSnappyBlockAsm10B: - CMPL R9, $0x40 + CMPL R10, $0x40 JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B MOVB $0xee, (AX) - MOVW BP, 1(AX) - LEAL -60(R9), R9 + MOVW SI, 1(AX) + LEAL -60(R10), R10 ADDQ $0x03, AX JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm10B two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B: - CMPL R9, $0x0c + CMPL R10, $0x0c JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B MOVB $0x01, BL - LEAL -16(BX)(R9*4), R9 - MOVB BP, 1(AX) - SHRL $0x08, BP - SHLL $0x05, BP - ORL BP, R9 - MOVB R9, (AX) + LEAL -16(BX)(R10*4), R10 + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm10B emit_copy_three_match_nolit_encodeSnappyBlockAsm10B: MOVB $0x02, BL - LEAL -4(BX)(R9*4), R9 - MOVB R9, (AX) - MOVW BP, 1(AX) + LEAL -4(BX)(R10*4), R10 + MOVB R10, (AX) + MOVW SI, 1(AX) ADDQ $0x03, AX match_nolit_emitcopy_end_encodeSnappyBlockAsm10B: CMPL CX, 8(SP) JGE emit_remainder_encodeSnappyBlockAsm10B - MOVQ -2(DX)(CX*1), SI + MOVQ -2(DX)(CX*1), DI CMPQ AX, (SP) JL match_nolit_dst_ok_encodeSnappyBlockAsm10B MOVQ $0x00000000, ret+48(FP) RET match_nolit_dst_ok_encodeSnappyBlockAsm10B: - MOVQ $0x9e3779b1, R8 - MOVQ SI, DI - SHRQ $0x10, SI - MOVQ SI, BP - SHLQ $0x20, DI - IMULQ R8, DI - SHRQ $0x36, DI - SHLQ $0x20, BP - IMULQ R8, BP - SHRQ $0x36, BP - LEAL -2(CX), R8 - LEAQ 24(SP)(BP*4), R9 - MOVL (R9), BP - MOVL R8, 24(SP)(DI*4) - MOVL CX, (R9) - CMPL (DX)(BP*1), SI + MOVQ $0x9e3779b1, R9 + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, SI + SHLQ $0x20, R8 + IMULQ R9, R8 + SHRQ $0x36, R8 + SHLQ $0x20, SI + IMULQ R9, SI + SHRQ $0x36, SI + LEAL -2(CX), R9 + LEAQ 24(SP)(SI*4), R10 + MOVL (R10), SI + MOVL R9, 24(SP)(R8*4) + MOVL CX, (R10) + CMPL (DX)(SI*1), DI JEQ match_nolit_loop_encodeSnappyBlockAsm10B INCL CX JMP search_loop_encodeSnappyBlockAsm10B @@ -11445,11 +11671,11 @@ emit_remainder_ok_encodeSnappyBlockAsm10B: MOVL 12(SP), BX CMPL BX, CX JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B - MOVL CX, BP + MOVL CX, SI MOVL CX, 12(SP) LEAQ (DX)(BX*1), CX - SUBL BX, BP - LEAL -1(BP), DX + SUBL BX, SI + LEAL -1(SI), DX CMPL DX, $0x3c JLT one_byte_emit_remainder_encodeSnappyBlockAsm10B CMPL DX, $0x00000100 @@ -11473,46 +11699,27 @@ one_byte_emit_remainder_encodeSnappyBlockAsm10B: ADDQ $0x01, AX memmove_emit_remainder_encodeSnappyBlockAsm10B: - LEAQ (AX)(BP*1), DX - MOVL BP, BX + LEAQ (AX)(SI*1), DX + MOVL SI, BX // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3 CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7 + JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2: - MOVB (CX), BP - MOVB -1(CX)(BX*1), CL - MOVB BP, (AX) - MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3: - MOVW (CX), BP - MOVB 2(CX), CL - MOVW BP, (AX) - MOVB CL, 2(AX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7: - MOVL (CX), BP - MOVL -4(CX)(BX*1), CX - MOVL BP, (AX) - MOVL CX, -4(AX)(BX*1) +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8: + MOVQ (CX), SI + MOVQ SI, (AX) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16: - MOVQ (CX), BP + MOVQ (CX), SI MOVQ -8(CX)(BX*1), CX - MOVQ BP, (AX) + MOVQ SI, (AX) MOVQ CX, -8(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B @@ -11538,43 +11745,43 @@ memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B: JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B memmove_long_emit_remainder_encodeSnappyBlockAsm10B: - LEAQ (AX)(BP*1), DX - MOVL BP, BX + LEAQ (AX)(SI*1), DX + MOVL SI, BX // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 - MOVQ BX, SI - SHRQ $0x05, SI - MOVQ AX, BP - ANDL $0x0000001f, BP - MOVQ $0x00000040, DI - SUBQ BP, DI - DECQ SI + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(CX)(DI*1), BP - LEAQ -32(AX)(DI*1), R8 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back: - MOVOU (BP), X4 - MOVOU 16(BP), X5 - MOVOA X4, (R8) - MOVOA X5, 16(R8) + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI ADDQ $0x20, R8 - ADDQ $0x20, BP - ADDQ $0x20, DI - DECQ SI + DECQ DI JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(CX)(DI*1), X4 - MOVOU -16(CX)(DI*1), X5 - MOVOA X4, -32(AX)(DI*1) - MOVOA X5, -16(AX)(DI*1) - ADDQ $0x20, DI - CMPQ BX, DI + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) @@ -11610,9 +11817,9 @@ zero_loop_encodeSnappyBlockAsm8B: JNZ zero_loop_encodeSnappyBlockAsm8B MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX - LEAQ -5(CX), DX - LEAQ -8(CX), BP - MOVL BP, 8(SP) + LEAQ -9(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) SHRQ $0x05, CX SUBL CX, DX LEAQ (AX)(DX*1), DX @@ -11622,273 +11829,254 @@ zero_loop_encodeSnappyBlockAsm8B: MOVQ src_base+24(FP), DX search_loop_encodeSnappyBlockAsm8B: - MOVQ (DX)(CX*1), SI - MOVL CX, BP - SUBL 12(SP), BP - SHRL $0x04, BP - LEAL 4(CX)(BP*1), BP - CMPL BP, 8(SP) + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x04, SI + LEAL 4(CX)(SI*1), SI + CMPL SI, 8(SP) JGE emit_remainder_encodeSnappyBlockAsm8B - MOVL BP, 20(SP) - MOVQ $0x9e3779b1, R8 - MOVQ SI, R9 - MOVQ SI, R10 - SHRQ $0x08, R10 - SHLQ $0x20, R9 - IMULQ R8, R9 - SHRQ $0x38, R9 + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x9e3779b1, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 SHLQ $0x20, R10 - IMULQ R8, R10 + IMULQ R9, R10 SHRQ $0x38, R10 - MOVL 24(SP)(R9*4), BP - MOVL 24(SP)(R10*4), DI - MOVL CX, 24(SP)(R9*4) - LEAL 1(CX), R9 - MOVL R9, 24(SP)(R10*4) - MOVQ SI, R9 - SHRQ $0x10, R9 - SHLQ $0x20, R9 - IMULQ R8, R9 - SHRQ $0x38, R9 - MOVL CX, R8 - SUBL 16(SP), R8 - MOVL 1(DX)(R8*1), R10 - MOVQ SI, R8 - SHRQ $0x08, R8 - CMPL R8, R10 + SHLQ $0x20, R11 + IMULQ R9, R11 + SHRQ $0x38, R11 + MOVL 24(SP)(R10*4), SI + MOVL 24(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + LEAL 1(CX), R10 + MOVL R10, 24(SP)(R11*4) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x20, R10 + IMULQ R9, R10 + SHRQ $0x38, R10 + MOVL CX, R9 + SUBL 16(SP), R9 + MOVL 1(DX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 JNE no_repeat_found_encodeSnappyBlockAsm8B - LEAL 1(CX), SI - MOVL 12(SP), BP - MOVL SI, DI - SUBL 16(SP), DI + LEAL 1(CX), DI + MOVL 12(SP), SI + MOVL DI, R8 + SUBL 16(SP), R8 JZ repeat_extend_back_end_encodeSnappyBlockAsm8B repeat_extend_back_loop_encodeSnappyBlockAsm8B: - CMPL SI, BP + CMPL DI, SI JLE repeat_extend_back_end_encodeSnappyBlockAsm8B - MOVB -1(DX)(DI*1), BL - MOVB -1(DX)(SI*1), R8 - CMPB BL, R8 + MOVB -1(DX)(R8*1), BL + MOVB -1(DX)(DI*1), R9 + CMPB BL, R9 JNE repeat_extend_back_end_encodeSnappyBlockAsm8B - LEAL -1(SI), SI - DECL DI + LEAL -1(DI), DI + DECL R8 JNZ repeat_extend_back_loop_encodeSnappyBlockAsm8B repeat_extend_back_end_encodeSnappyBlockAsm8B: - MOVL 12(SP), BP - CMPL BP, SI + MOVL 12(SP), SI + CMPL SI, DI JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B - MOVL SI, DI - MOVL SI, 12(SP) - LEAQ (DX)(BP*1), R8 - SUBL BP, DI - LEAL -1(DI), BP - CMPL BP, $0x3c + MOVL DI, R8 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R9 + SUBL SI, R8 + LEAL -1(R8), SI + CMPL SI, $0x3c JLT one_byte_repeat_emit_encodeSnappyBlockAsm8B - CMPL BP, $0x00000100 + CMPL SI, $0x00000100 JLT two_bytes_repeat_emit_encodeSnappyBlockAsm8B MOVB $0xf4, (AX) - MOVW BP, 1(AX) + MOVW SI, 1(AX) ADDQ $0x03, AX JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B two_bytes_repeat_emit_encodeSnappyBlockAsm8B: MOVB $0xf0, (AX) - MOVB BP, 1(AX) + MOVB SI, 1(AX) ADDQ $0x02, AX - CMPL BP, $0x40 + CMPL SI, $0x40 JL memmove_repeat_emit_encodeSnappyBlockAsm8B JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B one_byte_repeat_emit_encodeSnappyBlockAsm8B: - SHLB $0x02, BP - MOVB BP, (AX) + SHLB $0x02, SI + MOVB SI, (AX) ADDQ $0x01, AX memmove_repeat_emit_encodeSnappyBlockAsm8B: - LEAQ (AX)(DI*1), BP + LEAQ (AX)(R8*1), SI // genMemMoveShort - CMPQ DI, $0x03 - JB emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_1or2 - JE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_3 - CMPQ DI, $0x08 - JB emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_4through7 - CMPQ DI, $0x10 + CMPQ R8, $0x08 + JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8 + CMPQ R8, $0x10 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16 - CMPQ DI, $0x20 + CMPQ R8, $0x20 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32 JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64 -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_1or2: - MOVB (R8), R9 - MOVB -1(R8)(DI*1), R8 - MOVB R9, (AX) - MOVB R8, -1(AX)(DI*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_3: - MOVW (R8), R9 - MOVB 2(R8), R8 - MOVW R9, (AX) - MOVB R8, 2(AX) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_4through7: - MOVL (R8), R9 - MOVL -4(R8)(DI*1), R8 - MOVL R9, (AX) - MOVL R8, -4(AX)(DI*1) +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8: + MOVQ (R9), R10 + MOVQ R10, (AX) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16: - MOVQ (R8), R9 - MOVQ -8(R8)(DI*1), R8 - MOVQ R9, (AX) - MOVQ R8, -8(AX)(DI*1) + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32: - MOVOU (R8), X0 - MOVOU -16(R8)(DI*1), X1 + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(DI*1) + MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64: - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(DI*1), X2 - MOVOU -16(R8)(DI*1), X3 + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(DI*1) - MOVOU X3, -16(AX)(DI*1) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B: - MOVQ BP, AX + MOVQ SI, AX JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B memmove_long_repeat_emit_encodeSnappyBlockAsm8B: - LEAQ (AX)(DI*1), BP + LEAQ (AX)(R8*1), SI // genMemMoveLong - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(DI*1), X2 - MOVOU -16(R8)(DI*1), X3 - MOVQ DI, R10 - SHRQ $0x05, R10 - MOVQ AX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R11 - SUBQ R9, R11 - DECQ R10 + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(R8)(R11*1), R9 - LEAQ -32(AX)(R11*1), R12 + LEAQ -32(R9)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back: - MOVOU (R9), X4 - MOVOU 16(R9), X5 - MOVOA X4, (R12) - MOVOA X5, 16(R12) + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 ADDQ $0x20, R12 - ADDQ $0x20, R9 - ADDQ $0x20, R11 - DECQ R10 + DECQ R11 JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(R8)(R11*1), X4 - MOVOU -16(R8)(R11*1), X5 - MOVOA X4, -32(AX)(R11*1) - MOVOA X5, -16(AX)(R11*1) - ADDQ $0x20, R11 - CMPQ DI, R11 + MOVOU -32(R9)(R12*1), X4 + MOVOU -16(R9)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R8, R12 JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(DI*1) - MOVOU X3, -16(AX)(DI*1) - MOVQ BP, AX + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ SI, AX emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B: ADDL $0x05, CX - MOVL CX, BP - SUBL 16(SP), BP - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(BP*1), BP + MOVL CX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), SI // matchLen - XORL R10, R10 - CMPL DI, $0x08 + XORL R11, R11 + CMPL R8, $0x08 JL matchlen_single_repeat_extend_encodeSnappyBlockAsm8B matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B: - MOVQ (R8)(R10*1), R9 - XORQ (BP)(R10*1), R9 - TESTQ R9, R9 + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + TESTQ R10, R10 JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 + BSFQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B: - LEAL -8(DI), DI - LEAL 8(R10), R10 - CMPL DI, $0x08 + LEAL -8(R8), R8 + LEAL 8(R11), R11 + CMPL R8, $0x08 JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B matchlen_single_repeat_extend_encodeSnappyBlockAsm8B: - TESTL DI, DI + TESTL R8, R8 JZ repeat_extend_forward_end_encodeSnappyBlockAsm8B matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm8B: - MOVB (R8)(R10*1), R9 - CMPB (BP)(R10*1), R9 + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 JNE repeat_extend_forward_end_encodeSnappyBlockAsm8B - LEAL 1(R10), R10 - DECL DI + LEAL 1(R11), R11 + DECL R8 JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm8B repeat_extend_forward_end_encodeSnappyBlockAsm8B: - ADDL R10, CX - MOVL CX, BP - SUBL SI, BP - MOVL 16(SP), SI + ADDL R11, CX + MOVL CX, SI + SUBL DI, SI + MOVL 16(SP), DI // emitCopy two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B: - CMPL BP, $0x40 + CMPL SI, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B MOVB $0xee, (AX) - MOVW SI, 1(AX) - LEAL -60(BP), BP + MOVW DI, 1(AX) + LEAL -60(SI), SI ADDQ $0x03, AX JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B: - CMPL BP, $0x0c + CMPL SI, $0x0c JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B MOVB $0x01, BL - LEAL -16(BX)(BP*4), BP - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, BP - MOVB BP, (AX) + LEAL -16(BX)(SI*4), SI + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeSnappyBlockAsm8B emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B: MOVB $0x02, BL - LEAL -4(BX)(BP*4), BP - MOVB BP, (AX) - MOVW SI, 1(AX) + LEAL -4(BX)(SI*4), SI + MOVB SI, (AX) + MOVW DI, 1(AX) ADDQ $0x03, AX repeat_end_emit_encodeSnappyBlockAsm8B: @@ -11896,16 +12084,16 @@ repeat_end_emit_encodeSnappyBlockAsm8B: JMP search_loop_encodeSnappyBlockAsm8B no_repeat_found_encodeSnappyBlockAsm8B: - CMPL (DX)(BP*1), SI + CMPL (DX)(SI*1), DI JEQ candidate_match_encodeSnappyBlockAsm8B - SHRQ $0x08, SI - MOVL 24(SP)(R9*4), BP - LEAL 2(CX), R8 - CMPL (DX)(DI*1), SI + SHRQ $0x08, DI + MOVL 24(SP)(R10*4), SI + LEAL 2(CX), R9 + CMPL (DX)(R8*1), DI JEQ candidate2_match_encodeSnappyBlockAsm8B - MOVL R8, 24(SP)(R9*4) - SHRQ $0x08, SI - CMPL (DX)(BP*1), SI + MOVL R9, 24(SP)(R10*4) + SHRQ $0x08, DI + CMPL (DX)(SI*1), DI JEQ candidate3_match_encodeSnappyBlockAsm8B MOVL 20(SP), CX JMP search_loop_encodeSnappyBlockAsm8B @@ -11915,283 +12103,264 @@ candidate3_match_encodeSnappyBlockAsm8B: JMP candidate_match_encodeSnappyBlockAsm8B candidate2_match_encodeSnappyBlockAsm8B: - MOVL R8, 24(SP)(R9*4) + MOVL R9, 24(SP)(R10*4) INCL CX - MOVL DI, BP + MOVL R8, SI candidate_match_encodeSnappyBlockAsm8B: - MOVL 12(SP), SI - TESTL BP, BP + MOVL 12(SP), DI + TESTL SI, SI JZ match_extend_back_end_encodeSnappyBlockAsm8B match_extend_back_loop_encodeSnappyBlockAsm8B: - CMPL CX, SI + CMPL CX, DI JLE match_extend_back_end_encodeSnappyBlockAsm8B - MOVB -1(DX)(BP*1), BL - MOVB -1(DX)(CX*1), DI - CMPB BL, DI + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 JNE match_extend_back_end_encodeSnappyBlockAsm8B LEAL -1(CX), CX - DECL BP + DECL SI JZ match_extend_back_end_encodeSnappyBlockAsm8B JMP match_extend_back_loop_encodeSnappyBlockAsm8B match_extend_back_end_encodeSnappyBlockAsm8B: - MOVL CX, SI - SUBL 12(SP), SI - LEAQ 3(AX)(SI*1), SI - CMPQ SI, (SP) + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 3(AX)(DI*1), DI + CMPQ DI, (SP) JL match_dst_size_check_encodeSnappyBlockAsm8B MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeSnappyBlockAsm8B: - MOVL CX, SI - MOVL 12(SP), DI - CMPL DI, SI + MOVL CX, DI + MOVL 12(SP), R8 + CMPL R8, DI JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm8B - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(DI*1), SI - SUBL DI, R8 - LEAL -1(R8), DI - CMPL DI, $0x3c + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(R8*1), DI + SUBL R8, R9 + LEAL -1(R9), R8 + CMPL R8, $0x3c JLT one_byte_match_emit_encodeSnappyBlockAsm8B - CMPL DI, $0x00000100 + CMPL R8, $0x00000100 JLT two_bytes_match_emit_encodeSnappyBlockAsm8B MOVB $0xf4, (AX) - MOVW DI, 1(AX) + MOVW R8, 1(AX) ADDQ $0x03, AX JMP memmove_long_match_emit_encodeSnappyBlockAsm8B two_bytes_match_emit_encodeSnappyBlockAsm8B: MOVB $0xf0, (AX) - MOVB DI, 1(AX) + MOVB R8, 1(AX) ADDQ $0x02, AX - CMPL DI, $0x40 + CMPL R8, $0x40 JL memmove_match_emit_encodeSnappyBlockAsm8B JMP memmove_long_match_emit_encodeSnappyBlockAsm8B one_byte_match_emit_encodeSnappyBlockAsm8B: - SHLB $0x02, DI - MOVB DI, (AX) + SHLB $0x02, R8 + MOVB R8, (AX) ADDQ $0x01, AX memmove_match_emit_encodeSnappyBlockAsm8B: - LEAQ (AX)(R8*1), DI + LEAQ (AX)(R9*1), R8 // genMemMoveShort - CMPQ R8, $0x03 - JB emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_1or2 - JE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_3 - CMPQ R8, $0x08 - JB emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_4through7 - CMPQ R8, $0x10 + CMPQ R9, $0x08 + JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8 + CMPQ R9, $0x10 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16 - CMPQ R8, $0x20 + CMPQ R9, $0x20 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64 -emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_1or2: - MOVB (SI), R9 - MOVB -1(SI)(R8*1), SI - MOVB R9, (AX) - MOVB SI, -1(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_3: - MOVW (SI), R9 - MOVB 2(SI), SI - MOVW R9, (AX) - MOVB SI, 2(AX) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_4through7: - MOVL (SI), R9 - MOVL -4(SI)(R8*1), SI - MOVL R9, (AX) - MOVL SI, -4(AX)(R8*1) +emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8: + MOVQ (DI), R10 + MOVQ R10, (AX) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16: - MOVQ (SI), R9 - MOVQ -8(SI)(R8*1), SI - MOVQ R9, (AX) - MOVQ SI, -8(AX)(R8*1) + MOVQ (DI), R10 + MOVQ -8(DI)(R9*1), DI + MOVQ R10, (AX) + MOVQ DI, -8(AX)(R9*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32: - MOVOU (SI), X0 - MOVOU -16(SI)(R8*1), X1 + MOVOU (DI), X0 + MOVOU -16(DI)(R9*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) + MOVOU X1, -16(AX)(R9*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64: - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) memmove_end_copy_match_emit_encodeSnappyBlockAsm8B: - MOVQ DI, AX + MOVQ R8, AX JMP emit_literal_done_match_emit_encodeSnappyBlockAsm8B memmove_long_match_emit_encodeSnappyBlockAsm8B: - LEAQ (AX)(R8*1), DI + LEAQ (AX)(R9*1), R8 // genMemMoveLong - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 - MOVQ R8, R10 - SHRQ $0x05, R10 - MOVQ AX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R11 - SUBQ R9, R11 - DECQ R10 + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVQ R9, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(SI)(R11*1), R9 - LEAQ -32(AX)(R11*1), R12 + LEAQ -32(DI)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back: - MOVOU (R9), X4 - MOVOU 16(R9), X5 - MOVOA X4, (R12) - MOVOA X5, 16(R12) + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 ADDQ $0x20, R12 - ADDQ $0x20, R9 - ADDQ $0x20, R11 - DECQ R10 + DECQ R11 JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(SI)(R11*1), X4 - MOVOU -16(SI)(R11*1), X5 - MOVOA X4, -32(AX)(R11*1) - MOVOA X5, -16(AX)(R11*1) - ADDQ $0x20, R11 - CMPQ R8, R11 + MOVOU -32(DI)(R12*1), X4 + MOVOU -16(DI)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R9, R12 JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ DI, AX + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ R8, AX emit_literal_done_match_emit_encodeSnappyBlockAsm8B: match_nolit_loop_encodeSnappyBlockAsm8B: - MOVL CX, SI - SUBL BP, SI - MOVL SI, 16(SP) + MOVL CX, DI + SUBL SI, DI + MOVL DI, 16(SP) ADDL $0x04, CX - ADDL $0x04, BP - MOVQ src_len+32(FP), SI - SUBL CX, SI - LEAQ (DX)(CX*1), DI - LEAQ (DX)(BP*1), BP + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(SI*1), SI // matchLen - XORL R9, R9 - CMPL SI, $0x08 + XORL R10, R10 + CMPL DI, $0x08 JL matchlen_single_match_nolit_encodeSnappyBlockAsm8B matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B: - MOVQ (DI)(R9*1), R8 - XORQ (BP)(R9*1), R8 - TESTQ R8, R8 + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + TESTQ R9, R9 JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm8B - BSFQ R8, R8 - SARQ $0x03, R8 - LEAL (R9)(R8*1), R9 + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 JMP match_nolit_end_encodeSnappyBlockAsm8B matchlen_loop_match_nolit_encodeSnappyBlockAsm8B: - LEAL -8(SI), SI - LEAL 8(R9), R9 - CMPL SI, $0x08 + LEAL -8(DI), DI + LEAL 8(R10), R10 + CMPL DI, $0x08 JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B matchlen_single_match_nolit_encodeSnappyBlockAsm8B: - TESTL SI, SI + TESTL DI, DI JZ match_nolit_end_encodeSnappyBlockAsm8B matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm8B: - MOVB (DI)(R9*1), R8 - CMPB (BP)(R9*1), R8 + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 JNE match_nolit_end_encodeSnappyBlockAsm8B - LEAL 1(R9), R9 - DECL SI + LEAL 1(R10), R10 + DECL DI JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm8B match_nolit_end_encodeSnappyBlockAsm8B: - ADDL R9, CX - MOVL 16(SP), BP - ADDL $0x04, R9 + ADDL R10, CX + MOVL 16(SP), SI + ADDL $0x04, R10 MOVL CX, 12(SP) // emitCopy two_byte_offset_match_nolit_encodeSnappyBlockAsm8B: - CMPL R9, $0x40 + CMPL R10, $0x40 JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B MOVB $0xee, (AX) - MOVW BP, 1(AX) - LEAL -60(R9), R9 + MOVW SI, 1(AX) + LEAL -60(R10), R10 ADDQ $0x03, AX JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm8B two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B: - CMPL R9, $0x0c + CMPL R10, $0x0c JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm8B MOVB $0x01, BL - LEAL -16(BX)(R9*4), R9 - MOVB BP, 1(AX) - SHRL $0x08, BP - SHLL $0x05, BP - ORL BP, R9 - MOVB R9, (AX) + LEAL -16(BX)(R10*4), R10 + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm8B emit_copy_three_match_nolit_encodeSnappyBlockAsm8B: MOVB $0x02, BL - LEAL -4(BX)(R9*4), R9 - MOVB R9, (AX) - MOVW BP, 1(AX) + LEAL -4(BX)(R10*4), R10 + MOVB R10, (AX) + MOVW SI, 1(AX) ADDQ $0x03, AX match_nolit_emitcopy_end_encodeSnappyBlockAsm8B: CMPL CX, 8(SP) JGE emit_remainder_encodeSnappyBlockAsm8B - MOVQ -2(DX)(CX*1), SI + MOVQ -2(DX)(CX*1), DI CMPQ AX, (SP) JL match_nolit_dst_ok_encodeSnappyBlockAsm8B MOVQ $0x00000000, ret+48(FP) RET match_nolit_dst_ok_encodeSnappyBlockAsm8B: - MOVQ $0x9e3779b1, R8 - MOVQ SI, DI - SHRQ $0x10, SI - MOVQ SI, BP - SHLQ $0x20, DI - IMULQ R8, DI - SHRQ $0x38, DI - SHLQ $0x20, BP - IMULQ R8, BP - SHRQ $0x38, BP - LEAL -2(CX), R8 - LEAQ 24(SP)(BP*4), R9 - MOVL (R9), BP - MOVL R8, 24(SP)(DI*4) - MOVL CX, (R9) - CMPL (DX)(BP*1), SI + MOVQ $0x9e3779b1, R9 + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, SI + SHLQ $0x20, R8 + IMULQ R9, R8 + SHRQ $0x38, R8 + SHLQ $0x20, SI + IMULQ R9, SI + SHRQ $0x38, SI + LEAL -2(CX), R9 + LEAQ 24(SP)(SI*4), R10 + MOVL (R10), SI + MOVL R9, 24(SP)(R8*4) + MOVL CX, (R10) + CMPL (DX)(SI*1), DI JEQ match_nolit_loop_encodeSnappyBlockAsm8B INCL CX JMP search_loop_encodeSnappyBlockAsm8B @@ -12210,11 +12379,11 @@ emit_remainder_ok_encodeSnappyBlockAsm8B: MOVL 12(SP), BX CMPL BX, CX JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B - MOVL CX, BP + MOVL CX, SI MOVL CX, 12(SP) LEAQ (DX)(BX*1), CX - SUBL BX, BP - LEAL -1(BP), DX + SUBL BX, SI + LEAL -1(SI), DX CMPL DX, $0x3c JLT one_byte_emit_remainder_encodeSnappyBlockAsm8B CMPL DX, $0x00000100 @@ -12238,46 +12407,27 @@ one_byte_emit_remainder_encodeSnappyBlockAsm8B: ADDQ $0x01, AX memmove_emit_remainder_encodeSnappyBlockAsm8B: - LEAQ (AX)(BP*1), DX - MOVL BP, BX + LEAQ (AX)(SI*1), DX + MOVL SI, BX // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3 CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7 + JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2: - MOVB (CX), BP - MOVB -1(CX)(BX*1), CL - MOVB BP, (AX) - MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3: - MOVW (CX), BP - MOVB 2(CX), CL - MOVW BP, (AX) - MOVB CL, 2(AX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7: - MOVL (CX), BP - MOVL -4(CX)(BX*1), CX - MOVL BP, (AX) - MOVL CX, -4(AX)(BX*1) +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8: + MOVQ (CX), SI + MOVQ SI, (AX) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16: - MOVQ (CX), BP + MOVQ (CX), SI MOVQ -8(CX)(BX*1), CX - MOVQ BP, (AX) + MOVQ SI, (AX) MOVQ CX, -8(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B @@ -12303,43 +12453,43 @@ memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B: JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B memmove_long_emit_remainder_encodeSnappyBlockAsm8B: - LEAQ (AX)(BP*1), DX - MOVL BP, BX + LEAQ (AX)(SI*1), DX + MOVL SI, BX // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 - MOVQ BX, SI - SHRQ $0x05, SI - MOVQ AX, BP - ANDL $0x0000001f, BP - MOVQ $0x00000040, DI - SUBQ BP, DI - DECQ SI + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(CX)(DI*1), BP - LEAQ -32(AX)(DI*1), R8 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back: - MOVOU (BP), X4 - MOVOU 16(BP), X5 - MOVOA X4, (R8) - MOVOA X5, 16(R8) + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI ADDQ $0x20, R8 - ADDQ $0x20, BP - ADDQ $0x20, DI - DECQ SI + DECQ DI JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(CX)(DI*1), X4 - MOVOU -16(CX)(DI*1), X5 - MOVOA X4, -32(AX)(DI*1) - MOVOA X5, -16(AX)(DI*1) - ADDQ $0x20, DI - CMPQ BX, DI + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) @@ -12353,6 +12503,2598 @@ emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B: MOVQ AX, ret+48(FP) RET +// func encodeSnappyBetterBlockAsm(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeSnappyBetterBlockAsm(SB), $327704-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000a00, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeSnappyBetterBlockAsm: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeSnappyBetterBlockAsm + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL $0x00000000, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeSnappyBetterBlockAsm: + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x07, SI + CMPL SI, $0x63 + JLE check_maxskip_ok_encodeSnappyBetterBlockAsm + LEAL 100(CX), SI + JMP check_maxskip_cont_encodeSnappyBetterBlockAsm + +check_maxskip_ok_encodeSnappyBetterBlockAsm: + LEAL 1(CX)(SI*1), SI + +check_maxskip_cont_encodeSnappyBetterBlockAsm: + CMPL SI, 8(SP) + JGE emit_remainder_encodeSnappyBetterBlockAsm + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x00cf1bbcdcbfa563, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x08, R10 + IMULQ R9, R10 + SHRQ $0x30, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x32, R11 + MOVL 24(SP)(R10*4), SI + MOVL 262168(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + MOVL CX, 262168(SP)(R11*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeSnappyBetterBlockAsm + CMPL (DX)(R8*1), DI + JEQ candidateS_match_encodeSnappyBetterBlockAsm + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBetterBlockAsm + +candidateS_match_encodeSnappyBetterBlockAsm: + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x08, R10 + IMULQ R9, R10 + SHRQ $0x30, R10 + MOVL 24(SP)(R10*4), SI + INCL CX + MOVL CX, 24(SP)(R10*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeSnappyBetterBlockAsm + DECL CX + MOVL R8, SI + +candidate_match_encodeSnappyBetterBlockAsm: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeSnappyBetterBlockAsm + +match_extend_back_loop_encodeSnappyBetterBlockAsm: + CMPL CX, DI + JLE match_extend_back_end_encodeSnappyBetterBlockAsm + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 + JNE match_extend_back_end_encodeSnappyBetterBlockAsm + LEAL -1(CX), CX + DECL SI + JZ match_extend_back_end_encodeSnappyBetterBlockAsm + JMP match_extend_back_loop_encodeSnappyBetterBlockAsm + +match_extend_back_end_encodeSnappyBetterBlockAsm: + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 5(AX)(DI*1), DI + CMPQ DI, (SP) + JL match_dst_size_check_encodeSnappyBetterBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeSnappyBetterBlockAsm: + MOVL CX, DI + ADDL $0x04, CX + ADDL $0x04, SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), R10 + + // matchLen + XORL R12, R12 + CMPL R8, $0x08 + JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm + +matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm: + MOVQ (R9)(R12*1), R11 + XORQ (R10)(R12*1), R11 + TESTQ R11, R11 + JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm + BSFQ R11, R11 + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeSnappyBetterBlockAsm + +matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm: + LEAL -8(R8), R8 + LEAL 8(R12), R12 + CMPL R8, $0x08 + JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm + +matchlen_single_match_nolit_encodeSnappyBetterBlockAsm: + TESTL R8, R8 + JZ match_nolit_end_encodeSnappyBetterBlockAsm + +matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm: + MOVB (R9)(R12*1), R11 + CMPB (R10)(R12*1), R11 + JNE match_nolit_end_encodeSnappyBetterBlockAsm + LEAL 1(R12), R12 + DECL R8 + JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm + +match_nolit_end_encodeSnappyBetterBlockAsm: + MOVL CX, R8 + SUBL SI, R8 + + // Check if repeat + CMPL R12, $0x01 + JG match_length_ok_encodeSnappyBetterBlockAsm + CMPL R8, $0x0000ffff + JLE match_length_ok_encodeSnappyBetterBlockAsm + MOVL 20(SP), CX + INCL CX + JMP search_loop_encodeSnappyBetterBlockAsm + +match_length_ok_encodeSnappyBetterBlockAsm: + MOVL R8, 16(SP) + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JLT one_byte_match_emit_encodeSnappyBetterBlockAsm + CMPL SI, $0x00000100 + JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm + CMPL SI, $0x00010000 + JLT three_bytes_match_emit_encodeSnappyBetterBlockAsm + CMPL SI, $0x01000000 + JLT four_bytes_match_emit_encodeSnappyBetterBlockAsm + MOVB $0xfc, (AX) + MOVL SI, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm + +four_bytes_match_emit_encodeSnappyBetterBlockAsm: + MOVL SI, R11 + SHRL $0x10, R11 + MOVB $0xf8, (AX) + MOVW SI, 1(AX) + MOVB R11, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm + +three_bytes_match_emit_encodeSnappyBetterBlockAsm: + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm + +two_bytes_match_emit_encodeSnappyBetterBlockAsm: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_match_emit_encodeSnappyBetterBlockAsm + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm + +one_byte_match_emit_encodeSnappyBetterBlockAsm: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeSnappyBetterBlockAsm: + LEAQ (AX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8: + MOVQ (R10), R11 + MOVQ R11, (AX) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm: + MOVQ SI, AX + JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm + +memmove_long_match_emit_encodeSnappyBetterBlockAsm: + LEAQ (AX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 + JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(AX)(R14*1), R15 + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX + +emit_literal_done_match_emit_encodeSnappyBetterBlockAsm: + ADDL R12, CX + ADDL $0x04, R12 + MOVL CX, 12(SP) + + // emitCopy + CMPL R8, $0x00010000 + JL two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm + +four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm: + CMPL R12, $0x40 + JLE four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm + MOVB $0xff, (AX) + MOVL R8, 1(AX) + LEAL -64(R12), R12 + ADDQ $0x05, AX + CMPL R12, $0x04 + JL four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm + JMP four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm + +four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm: + TESTL R12, R12 + JZ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm + MOVB $0x03, BL + LEAL -4(BX)(R12*4), R12 + MOVB R12, (AX) + MOVL R8, 1(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm + +two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm: + CMPL R12, $0x40 + JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm + MOVB $0xee, (AX) + MOVW R8, 1(AX) + LEAL -60(R12), R12 + ADDQ $0x03, AX + JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm + +two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm: + CMPL R12, $0x0c + JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm + CMPL R8, $0x00000800 + JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm + MOVB $0x01, BL + LEAL -16(BX)(R12*4), R12 + MOVB R8, 1(AX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm + +emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm: + MOVB $0x02, BL + LEAL -4(BX)(R12*4), R12 + MOVB R12, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm: + CMPL CX, 8(SP) + JGE emit_remainder_encodeSnappyBetterBlockAsm + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeSnappyBetterBlockAsm: + MOVQ $0x00cf1bbcdcbfa563, SI + MOVQ $0x9e3779b1, R8 + INCL DI + MOVQ (DX)(DI*1), R9 + MOVQ R9, R10 + MOVQ R9, R11 + MOVQ R9, R12 + SHRQ $0x08, R11 + MOVQ R11, R13 + SHRQ $0x10, R12 + LEAL 1(DI), R14 + LEAL 2(DI), R15 + MOVQ -2(DX)(CX*1), R9 + SHLQ $0x08, R10 + IMULQ SI, R10 + SHRQ $0x30, R10 + SHLQ $0x08, R13 + IMULQ SI, R13 + SHRQ $0x30, R13 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x32, R11 + SHLQ $0x20, R12 + IMULQ R8, R12 + SHRQ $0x32, R12 + MOVL DI, 24(SP)(R10*4) + MOVL R14, 24(SP)(R13*4) + MOVL R14, 262168(SP)(R11*4) + MOVL R15, 262168(SP)(R12*4) + MOVQ R9, R10 + MOVQ R9, R11 + SHRQ $0x08, R11 + MOVQ R11, R13 + LEAL -2(CX), R9 + LEAL -1(CX), DI + SHLQ $0x08, R10 + IMULQ SI, R10 + SHRQ $0x30, R10 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x32, R11 + SHLQ $0x08, R13 + IMULQ SI, R13 + SHRQ $0x30, R13 + MOVL R9, 24(SP)(R10*4) + MOVL DI, 262168(SP)(R11*4) + MOVL DI, 24(SP)(R13*4) + JMP search_loop_encodeSnappyBetterBlockAsm + +emit_remainder_encodeSnappyBetterBlockAsm: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 5(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeSnappyBetterBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeSnappyBetterBlockAsm: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm + CMPL DX, $0x00010000 + JLT three_bytes_emit_remainder_encodeSnappyBetterBlockAsm + CMPL DX, $0x01000000 + JLT four_bytes_emit_remainder_encodeSnappyBetterBlockAsm + MOVB $0xfc, (AX) + MOVL DX, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm + +four_bytes_emit_remainder_encodeSnappyBetterBlockAsm: + MOVL DX, BX + SHRL $0x10, BX + MOVB $0xf8, (AX) + MOVW DX, 1(AX) + MOVB BL, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm + +three_bytes_emit_remainder_encodeSnappyBetterBlockAsm: + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm + +two_bytes_emit_remainder_encodeSnappyBetterBlockAsm: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeSnappyBetterBlockAsm + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm + +one_byte_emit_remainder_encodeSnappyBetterBlockAsm: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeSnappyBetterBlockAsm: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x08 + JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8: + MOVQ (CX), SI + MOVQ SI, (AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm + +memmove_long_emit_remainder_encodeSnappyBetterBlockAsm: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeSnappyBetterBlockAsm64K(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeSnappyBetterBlockAsm64K(SB), $327704-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000a00, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeSnappyBetterBlockAsm64K: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeSnappyBetterBlockAsm64K + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL $0x00000000, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeSnappyBetterBlockAsm64K: + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x07, SI + LEAL 1(CX)(SI*1), SI + CMPL SI, 8(SP) + JGE emit_remainder_encodeSnappyBetterBlockAsm64K + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x00cf1bbcdcbfa563, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x08, R10 + IMULQ R9, R10 + SHRQ $0x30, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x32, R11 + MOVL 24(SP)(R10*4), SI + MOVL 262168(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + MOVL CX, 262168(SP)(R11*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeSnappyBetterBlockAsm64K + CMPL (DX)(R8*1), DI + JEQ candidateS_match_encodeSnappyBetterBlockAsm64K + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBetterBlockAsm64K + +candidateS_match_encodeSnappyBetterBlockAsm64K: + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x08, R10 + IMULQ R9, R10 + SHRQ $0x30, R10 + MOVL 24(SP)(R10*4), SI + INCL CX + MOVL CX, 24(SP)(R10*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeSnappyBetterBlockAsm64K + DECL CX + MOVL R8, SI + +candidate_match_encodeSnappyBetterBlockAsm64K: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K + +match_extend_back_loop_encodeSnappyBetterBlockAsm64K: + CMPL CX, DI + JLE match_extend_back_end_encodeSnappyBetterBlockAsm64K + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 + JNE match_extend_back_end_encodeSnappyBetterBlockAsm64K + LEAL -1(CX), CX + DECL SI + JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K + JMP match_extend_back_loop_encodeSnappyBetterBlockAsm64K + +match_extend_back_end_encodeSnappyBetterBlockAsm64K: + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 3(AX)(DI*1), DI + CMPQ DI, (SP) + JL match_dst_size_check_encodeSnappyBetterBlockAsm64K + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeSnappyBetterBlockAsm64K: + MOVL CX, DI + ADDL $0x04, CX + ADDL $0x04, SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), R10 + + // matchLen + XORL R12, R12 + CMPL R8, $0x08 + JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm64K + +matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K: + MOVQ (R9)(R12*1), R11 + XORQ (R10)(R12*1), R11 + TESTQ R11, R11 + JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm64K + BSFQ R11, R11 + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeSnappyBetterBlockAsm64K + +matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm64K: + LEAL -8(R8), R8 + LEAL 8(R12), R12 + CMPL R8, $0x08 + JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K + +matchlen_single_match_nolit_encodeSnappyBetterBlockAsm64K: + TESTL R8, R8 + JZ match_nolit_end_encodeSnappyBetterBlockAsm64K + +matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm64K: + MOVB (R9)(R12*1), R11 + CMPB (R10)(R12*1), R11 + JNE match_nolit_end_encodeSnappyBetterBlockAsm64K + LEAL 1(R12), R12 + DECL R8 + JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm64K + +match_nolit_end_encodeSnappyBetterBlockAsm64K: + MOVL CX, R8 + SUBL SI, R8 + + // Check if repeat + MOVL R8, 16(SP) + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JLT one_byte_match_emit_encodeSnappyBetterBlockAsm64K + CMPL SI, $0x00000100 + JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm64K + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K + +two_bytes_match_emit_encodeSnappyBetterBlockAsm64K: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_match_emit_encodeSnappyBetterBlockAsm64K + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K + +one_byte_match_emit_encodeSnappyBetterBlockAsm64K: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeSnappyBetterBlockAsm64K: + LEAQ (AX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8: + MOVQ (R10), R11 + MOVQ R11, (AX) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K: + MOVQ SI, AX + JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K + +memmove_long_match_emit_encodeSnappyBetterBlockAsm64K: + LEAQ (AX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 + JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(AX)(R14*1), R15 + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32: + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX + +emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K: + ADDL R12, CX + ADDL $0x04, R12 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K: + CMPL R12, $0x40 + JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K + MOVB $0xee, (AX) + MOVW R8, 1(AX) + LEAL -60(R12), R12 + ADDQ $0x03, AX + JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K + +two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K: + CMPL R12, $0x0c + JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K + CMPL R8, $0x00000800 + JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K + MOVB $0x01, BL + LEAL -16(BX)(R12*4), R12 + MOVB R8, 1(AX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K + +emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K: + MOVB $0x02, BL + LEAL -4(BX)(R12*4), R12 + MOVB R12, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K: + CMPL CX, 8(SP) + JGE emit_remainder_encodeSnappyBetterBlockAsm64K + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K: + MOVQ $0x00cf1bbcdcbfa563, SI + MOVQ $0x9e3779b1, R8 + INCL DI + MOVQ (DX)(DI*1), R9 + MOVQ R9, R10 + MOVQ R9, R11 + MOVQ R9, R12 + SHRQ $0x08, R11 + MOVQ R11, R13 + SHRQ $0x10, R12 + LEAL 1(DI), R14 + LEAL 2(DI), R15 + MOVQ -2(DX)(CX*1), R9 + SHLQ $0x08, R10 + IMULQ SI, R10 + SHRQ $0x30, R10 + SHLQ $0x08, R13 + IMULQ SI, R13 + SHRQ $0x30, R13 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x32, R11 + SHLQ $0x20, R12 + IMULQ R8, R12 + SHRQ $0x32, R12 + MOVL DI, 24(SP)(R10*4) + MOVL R14, 24(SP)(R13*4) + MOVL R14, 262168(SP)(R11*4) + MOVL R15, 262168(SP)(R12*4) + MOVQ R9, R10 + MOVQ R9, R11 + SHRQ $0x08, R11 + MOVQ R11, R13 + LEAL -2(CX), R9 + LEAL -1(CX), DI + SHLQ $0x08, R10 + IMULQ SI, R10 + SHRQ $0x30, R10 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x32, R11 + SHLQ $0x08, R13 + IMULQ SI, R13 + SHRQ $0x30, R13 + MOVL R9, 24(SP)(R10*4) + MOVL DI, 262168(SP)(R11*4) + MOVL DI, 24(SP)(R13*4) + JMP search_loop_encodeSnappyBetterBlockAsm64K + +emit_remainder_encodeSnappyBetterBlockAsm64K: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeSnappyBetterBlockAsm64K + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeSnappyBetterBlockAsm64K: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K + +two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeSnappyBetterBlockAsm64K + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K + +one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeSnappyBetterBlockAsm64K: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x08 + JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8: + MOVQ (CX), SI + MOVQ SI, (AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K + +memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeSnappyBetterBlockAsm12B(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeSnappyBetterBlockAsm12B(SB), $81944-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000280, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeSnappyBetterBlockAsm12B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeSnappyBetterBlockAsm12B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL $0x00000000, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeSnappyBetterBlockAsm12B: + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x06, SI + LEAL 1(CX)(SI*1), SI + CMPL SI, 8(SP) + JGE emit_remainder_encodeSnappyBetterBlockAsm12B + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x34, R11 + MOVL 24(SP)(R10*4), SI + MOVL 65560(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + MOVL CX, 65560(SP)(R11*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeSnappyBetterBlockAsm12B + CMPL (DX)(R8*1), DI + JEQ candidateS_match_encodeSnappyBetterBlockAsm12B + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBetterBlockAsm12B + +candidateS_match_encodeSnappyBetterBlockAsm12B: + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + MOVL 24(SP)(R10*4), SI + INCL CX + MOVL CX, 24(SP)(R10*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeSnappyBetterBlockAsm12B + DECL CX + MOVL R8, SI + +candidate_match_encodeSnappyBetterBlockAsm12B: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B + +match_extend_back_loop_encodeSnappyBetterBlockAsm12B: + CMPL CX, DI + JLE match_extend_back_end_encodeSnappyBetterBlockAsm12B + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 + JNE match_extend_back_end_encodeSnappyBetterBlockAsm12B + LEAL -1(CX), CX + DECL SI + JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B + JMP match_extend_back_loop_encodeSnappyBetterBlockAsm12B + +match_extend_back_end_encodeSnappyBetterBlockAsm12B: + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 3(AX)(DI*1), DI + CMPQ DI, (SP) + JL match_dst_size_check_encodeSnappyBetterBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeSnappyBetterBlockAsm12B: + MOVL CX, DI + ADDL $0x04, CX + ADDL $0x04, SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), R10 + + // matchLen + XORL R12, R12 + CMPL R8, $0x08 + JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm12B + +matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B: + MOVQ (R9)(R12*1), R11 + XORQ (R10)(R12*1), R11 + TESTQ R11, R11 + JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm12B + BSFQ R11, R11 + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeSnappyBetterBlockAsm12B + +matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm12B: + LEAL -8(R8), R8 + LEAL 8(R12), R12 + CMPL R8, $0x08 + JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B + +matchlen_single_match_nolit_encodeSnappyBetterBlockAsm12B: + TESTL R8, R8 + JZ match_nolit_end_encodeSnappyBetterBlockAsm12B + +matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm12B: + MOVB (R9)(R12*1), R11 + CMPB (R10)(R12*1), R11 + JNE match_nolit_end_encodeSnappyBetterBlockAsm12B + LEAL 1(R12), R12 + DECL R8 + JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm12B + +match_nolit_end_encodeSnappyBetterBlockAsm12B: + MOVL CX, R8 + SUBL SI, R8 + + // Check if repeat + MOVL R8, 16(SP) + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JLT one_byte_match_emit_encodeSnappyBetterBlockAsm12B + CMPL SI, $0x00000100 + JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm12B + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B + +two_bytes_match_emit_encodeSnappyBetterBlockAsm12B: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_match_emit_encodeSnappyBetterBlockAsm12B + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B + +one_byte_match_emit_encodeSnappyBetterBlockAsm12B: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeSnappyBetterBlockAsm12B: + LEAQ (AX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8: + MOVQ (R10), R11 + MOVQ R11, (AX) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B: + MOVQ SI, AX + JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B + +memmove_long_match_emit_encodeSnappyBetterBlockAsm12B: + LEAQ (AX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 + JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(AX)(R14*1), R15 + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX + +emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B: + ADDL R12, CX + ADDL $0x04, R12 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B: + CMPL R12, $0x40 + JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B + MOVB $0xee, (AX) + MOVW R8, 1(AX) + LEAL -60(R12), R12 + ADDQ $0x03, AX + JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B + +two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B: + CMPL R12, $0x0c + JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B + CMPL R8, $0x00000800 + JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B + MOVB $0x01, BL + LEAL -16(BX)(R12*4), R12 + MOVB R8, 1(AX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B + +emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B: + MOVB $0x02, BL + LEAL -4(BX)(R12*4), R12 + MOVB R12, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B: + CMPL CX, 8(SP) + JGE emit_remainder_encodeSnappyBetterBlockAsm12B + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B: + MOVQ $0x0000cf1bbcdcbf9b, SI + MOVQ $0x9e3779b1, R8 + INCL DI + MOVQ (DX)(DI*1), R9 + MOVQ R9, R10 + MOVQ R9, R11 + MOVQ R9, R12 + SHRQ $0x08, R11 + MOVQ R11, R13 + SHRQ $0x10, R12 + LEAL 1(DI), R14 + LEAL 2(DI), R15 + MOVQ -2(DX)(CX*1), R9 + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x32, R10 + SHLQ $0x10, R13 + IMULQ SI, R13 + SHRQ $0x32, R13 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x34, R11 + SHLQ $0x20, R12 + IMULQ R8, R12 + SHRQ $0x34, R12 + MOVL DI, 24(SP)(R10*4) + MOVL R14, 24(SP)(R13*4) + MOVL R14, 65560(SP)(R11*4) + MOVL R15, 65560(SP)(R12*4) + MOVQ R9, R10 + MOVQ R9, R11 + SHRQ $0x08, R11 + MOVQ R11, R13 + LEAL -2(CX), R9 + LEAL -1(CX), DI + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x32, R10 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x34, R11 + SHLQ $0x10, R13 + IMULQ SI, R13 + SHRQ $0x32, R13 + MOVL R9, 24(SP)(R10*4) + MOVL DI, 65560(SP)(R11*4) + MOVL DI, 24(SP)(R13*4) + JMP search_loop_encodeSnappyBetterBlockAsm12B + +emit_remainder_encodeSnappyBetterBlockAsm12B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeSnappyBetterBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeSnappyBetterBlockAsm12B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B + +two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeSnappyBetterBlockAsm12B + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B + +one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeSnappyBetterBlockAsm12B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x08 + JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8: + MOVQ (CX), SI + MOVQ SI, (AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B + +memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeSnappyBetterBlockAsm10B(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeSnappyBetterBlockAsm10B(SB), $20504-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x000000a0, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeSnappyBetterBlockAsm10B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeSnappyBetterBlockAsm10B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL $0x00000000, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeSnappyBetterBlockAsm10B: + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x05, SI + LEAL 1(CX)(SI*1), SI + CMPL SI, 8(SP) + JGE emit_remainder_encodeSnappyBetterBlockAsm10B + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x34, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x36, R11 + MOVL 24(SP)(R10*4), SI + MOVL 16408(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + MOVL CX, 16408(SP)(R11*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeSnappyBetterBlockAsm10B + CMPL (DX)(R8*1), DI + JEQ candidateS_match_encodeSnappyBetterBlockAsm10B + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBetterBlockAsm10B + +candidateS_match_encodeSnappyBetterBlockAsm10B: + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x34, R10 + MOVL 24(SP)(R10*4), SI + INCL CX + MOVL CX, 24(SP)(R10*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeSnappyBetterBlockAsm10B + DECL CX + MOVL R8, SI + +candidate_match_encodeSnappyBetterBlockAsm10B: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B + +match_extend_back_loop_encodeSnappyBetterBlockAsm10B: + CMPL CX, DI + JLE match_extend_back_end_encodeSnappyBetterBlockAsm10B + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 + JNE match_extend_back_end_encodeSnappyBetterBlockAsm10B + LEAL -1(CX), CX + DECL SI + JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B + JMP match_extend_back_loop_encodeSnappyBetterBlockAsm10B + +match_extend_back_end_encodeSnappyBetterBlockAsm10B: + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 3(AX)(DI*1), DI + CMPQ DI, (SP) + JL match_dst_size_check_encodeSnappyBetterBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeSnappyBetterBlockAsm10B: + MOVL CX, DI + ADDL $0x04, CX + ADDL $0x04, SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), R10 + + // matchLen + XORL R12, R12 + CMPL R8, $0x08 + JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm10B + +matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B: + MOVQ (R9)(R12*1), R11 + XORQ (R10)(R12*1), R11 + TESTQ R11, R11 + JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm10B + BSFQ R11, R11 + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeSnappyBetterBlockAsm10B + +matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm10B: + LEAL -8(R8), R8 + LEAL 8(R12), R12 + CMPL R8, $0x08 + JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B + +matchlen_single_match_nolit_encodeSnappyBetterBlockAsm10B: + TESTL R8, R8 + JZ match_nolit_end_encodeSnappyBetterBlockAsm10B + +matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm10B: + MOVB (R9)(R12*1), R11 + CMPB (R10)(R12*1), R11 + JNE match_nolit_end_encodeSnappyBetterBlockAsm10B + LEAL 1(R12), R12 + DECL R8 + JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm10B + +match_nolit_end_encodeSnappyBetterBlockAsm10B: + MOVL CX, R8 + SUBL SI, R8 + + // Check if repeat + MOVL R8, 16(SP) + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JLT one_byte_match_emit_encodeSnappyBetterBlockAsm10B + CMPL SI, $0x00000100 + JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm10B + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B + +two_bytes_match_emit_encodeSnappyBetterBlockAsm10B: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_match_emit_encodeSnappyBetterBlockAsm10B + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B + +one_byte_match_emit_encodeSnappyBetterBlockAsm10B: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeSnappyBetterBlockAsm10B: + LEAQ (AX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8: + MOVQ (R10), R11 + MOVQ R11, (AX) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B: + MOVQ SI, AX + JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B + +memmove_long_match_emit_encodeSnappyBetterBlockAsm10B: + LEAQ (AX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 + JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(AX)(R14*1), R15 + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX + +emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B: + ADDL R12, CX + ADDL $0x04, R12 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B: + CMPL R12, $0x40 + JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B + MOVB $0xee, (AX) + MOVW R8, 1(AX) + LEAL -60(R12), R12 + ADDQ $0x03, AX + JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B + +two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B: + CMPL R12, $0x0c + JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B + CMPL R8, $0x00000800 + JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B + MOVB $0x01, BL + LEAL -16(BX)(R12*4), R12 + MOVB R8, 1(AX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B + +emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B: + MOVB $0x02, BL + LEAL -4(BX)(R12*4), R12 + MOVB R12, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B: + CMPL CX, 8(SP) + JGE emit_remainder_encodeSnappyBetterBlockAsm10B + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B: + MOVQ $0x0000cf1bbcdcbf9b, SI + MOVQ $0x9e3779b1, R8 + INCL DI + MOVQ (DX)(DI*1), R9 + MOVQ R9, R10 + MOVQ R9, R11 + MOVQ R9, R12 + SHRQ $0x08, R11 + MOVQ R11, R13 + SHRQ $0x10, R12 + LEAL 1(DI), R14 + LEAL 2(DI), R15 + MOVQ -2(DX)(CX*1), R9 + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x34, R10 + SHLQ $0x10, R13 + IMULQ SI, R13 + SHRQ $0x34, R13 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x36, R11 + SHLQ $0x20, R12 + IMULQ R8, R12 + SHRQ $0x36, R12 + MOVL DI, 24(SP)(R10*4) + MOVL R14, 24(SP)(R13*4) + MOVL R14, 16408(SP)(R11*4) + MOVL R15, 16408(SP)(R12*4) + MOVQ R9, R10 + MOVQ R9, R11 + SHRQ $0x08, R11 + MOVQ R11, R13 + LEAL -2(CX), R9 + LEAL -1(CX), DI + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x34, R10 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x36, R11 + SHLQ $0x10, R13 + IMULQ SI, R13 + SHRQ $0x34, R13 + MOVL R9, 24(SP)(R10*4) + MOVL DI, 16408(SP)(R11*4) + MOVL DI, 24(SP)(R13*4) + JMP search_loop_encodeSnappyBetterBlockAsm10B + +emit_remainder_encodeSnappyBetterBlockAsm10B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeSnappyBetterBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeSnappyBetterBlockAsm10B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B + +two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeSnappyBetterBlockAsm10B + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B + +one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeSnappyBetterBlockAsm10B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x08 + JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8: + MOVQ (CX), SI + MOVQ SI, (AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B + +memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeSnappyBetterBlockAsm8B(SB), $5144-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000028, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeSnappyBetterBlockAsm8B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeSnappyBetterBlockAsm8B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -9(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL $0x00000000, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeSnappyBetterBlockAsm8B: + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x04, SI + LEAL 1(CX)(SI*1), SI + CMPL SI, 8(SP) + JGE emit_remainder_encodeSnappyBetterBlockAsm8B + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x36, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x38, R11 + MOVL 24(SP)(R10*4), SI + MOVL 4120(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + MOVL CX, 4120(SP)(R11*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeSnappyBetterBlockAsm8B + CMPL (DX)(R8*1), DI + JEQ candidateS_match_encodeSnappyBetterBlockAsm8B + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBetterBlockAsm8B + +candidateS_match_encodeSnappyBetterBlockAsm8B: + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x36, R10 + MOVL 24(SP)(R10*4), SI + INCL CX + MOVL CX, 24(SP)(R10*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeSnappyBetterBlockAsm8B + DECL CX + MOVL R8, SI + +candidate_match_encodeSnappyBetterBlockAsm8B: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B + +match_extend_back_loop_encodeSnappyBetterBlockAsm8B: + CMPL CX, DI + JLE match_extend_back_end_encodeSnappyBetterBlockAsm8B + MOVB -1(DX)(SI*1), BL + MOVB -1(DX)(CX*1), R8 + CMPB BL, R8 + JNE match_extend_back_end_encodeSnappyBetterBlockAsm8B + LEAL -1(CX), CX + DECL SI + JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B + JMP match_extend_back_loop_encodeSnappyBetterBlockAsm8B + +match_extend_back_end_encodeSnappyBetterBlockAsm8B: + MOVL CX, DI + SUBL 12(SP), DI + LEAQ 3(AX)(DI*1), DI + CMPQ DI, (SP) + JL match_dst_size_check_encodeSnappyBetterBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeSnappyBetterBlockAsm8B: + MOVL CX, DI + ADDL $0x04, CX + ADDL $0x04, SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), R10 + + // matchLen + XORL R12, R12 + CMPL R8, $0x08 + JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm8B + +matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B: + MOVQ (R9)(R12*1), R11 + XORQ (R10)(R12*1), R11 + TESTQ R11, R11 + JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm8B + BSFQ R11, R11 + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeSnappyBetterBlockAsm8B + +matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm8B: + LEAL -8(R8), R8 + LEAL 8(R12), R12 + CMPL R8, $0x08 + JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B + +matchlen_single_match_nolit_encodeSnappyBetterBlockAsm8B: + TESTL R8, R8 + JZ match_nolit_end_encodeSnappyBetterBlockAsm8B + +matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm8B: + MOVB (R9)(R12*1), R11 + CMPB (R10)(R12*1), R11 + JNE match_nolit_end_encodeSnappyBetterBlockAsm8B + LEAL 1(R12), R12 + DECL R8 + JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm8B + +match_nolit_end_encodeSnappyBetterBlockAsm8B: + MOVL CX, R8 + SUBL SI, R8 + + // Check if repeat + MOVL R8, 16(SP) + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JLT one_byte_match_emit_encodeSnappyBetterBlockAsm8B + CMPL SI, $0x00000100 + JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm8B + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B + +two_bytes_match_emit_encodeSnappyBetterBlockAsm8B: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_match_emit_encodeSnappyBetterBlockAsm8B + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B + +one_byte_match_emit_encodeSnappyBetterBlockAsm8B: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeSnappyBetterBlockAsm8B: + LEAQ (AX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8: + MOVQ (R10), R11 + MOVQ R11, (AX) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (AX) + MOVQ R10, -8(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B: + MOVQ SI, AX + JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B + +memmove_long_match_emit_encodeSnappyBetterBlockAsm8B: + LEAQ (AX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 + JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(AX)(R14*1), R15 + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ SI, AX + +emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B: + ADDL R12, CX + ADDL $0x04, R12 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B: + CMPL R12, $0x40 + JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B + MOVB $0xee, (AX) + MOVW R8, 1(AX) + LEAL -60(R12), R12 + ADDQ $0x03, AX + JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B + +two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B: + CMPL R12, $0x0c + JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B + MOVB $0x01, BL + LEAL -16(BX)(R12*4), R12 + MOVB R8, 1(AX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B + +emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B: + MOVB $0x02, BL + LEAL -4(BX)(R12*4), R12 + MOVB R12, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B: + CMPL CX, 8(SP) + JGE emit_remainder_encodeSnappyBetterBlockAsm8B + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B: + MOVQ $0x0000cf1bbcdcbf9b, SI + MOVQ $0x9e3779b1, R8 + INCL DI + MOVQ (DX)(DI*1), R9 + MOVQ R9, R10 + MOVQ R9, R11 + MOVQ R9, R12 + SHRQ $0x08, R11 + MOVQ R11, R13 + SHRQ $0x10, R12 + LEAL 1(DI), R14 + LEAL 2(DI), R15 + MOVQ -2(DX)(CX*1), R9 + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x36, R10 + SHLQ $0x10, R13 + IMULQ SI, R13 + SHRQ $0x36, R13 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x38, R11 + SHLQ $0x20, R12 + IMULQ R8, R12 + SHRQ $0x38, R12 + MOVL DI, 24(SP)(R10*4) + MOVL R14, 24(SP)(R13*4) + MOVL R14, 4120(SP)(R11*4) + MOVL R15, 4120(SP)(R12*4) + MOVQ R9, R10 + MOVQ R9, R11 + SHRQ $0x08, R11 + MOVQ R11, R13 + LEAL -2(CX), R9 + LEAL -1(CX), DI + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x36, R10 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x38, R11 + SHLQ $0x10, R13 + IMULQ SI, R13 + SHRQ $0x36, R13 + MOVL R9, 24(SP)(R10*4) + MOVL DI, 4120(SP)(R11*4) + MOVL DI, 24(SP)(R13*4) + JMP search_loop_encodeSnappyBetterBlockAsm8B + +emit_remainder_encodeSnappyBetterBlockAsm8B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeSnappyBetterBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeSnappyBetterBlockAsm8B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B + MOVL CX, SI + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B + +two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeSnappyBetterBlockAsm8B + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B + +one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeSnappyBetterBlockAsm8B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x08 + JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8: + MOVQ (CX), SI + MOVQ SI, (AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(BX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B + +memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B: + LEAQ (AX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + // func emitLiteral(dst []byte, lit []byte) int // Requires: SSE2 TEXT ·emitLiteral(SB), NOSPLIT, $0-56 @@ -12362,50 +15104,50 @@ TEXT ·emitLiteral(SB), NOSPLIT, $0-56 TESTQ DX, DX JZ emit_literal_end_standalone_skip MOVL DX, BX - LEAL -1(DX), BP - CMPL BP, $0x3c + LEAL -1(DX), SI + CMPL SI, $0x3c JLT one_byte_standalone - CMPL BP, $0x00000100 + CMPL SI, $0x00000100 JLT two_bytes_standalone - CMPL BP, $0x00010000 + CMPL SI, $0x00010000 JLT three_bytes_standalone - CMPL BP, $0x01000000 + CMPL SI, $0x01000000 JLT four_bytes_standalone MOVB $0xfc, (AX) - MOVL BP, 1(AX) + MOVL SI, 1(AX) ADDQ $0x05, BX ADDQ $0x05, AX JMP memmove_long_standalone four_bytes_standalone: - MOVL BP, SI - SHRL $0x10, SI + MOVL SI, DI + SHRL $0x10, DI MOVB $0xf8, (AX) - MOVW BP, 1(AX) - MOVB SI, 3(AX) + MOVW SI, 1(AX) + MOVB DI, 3(AX) ADDQ $0x04, BX ADDQ $0x04, AX JMP memmove_long_standalone three_bytes_standalone: MOVB $0xf4, (AX) - MOVW BP, 1(AX) + MOVW SI, 1(AX) ADDQ $0x03, BX ADDQ $0x03, AX JMP memmove_long_standalone two_bytes_standalone: MOVB $0xf0, (AX) - MOVB BP, 1(AX) + MOVB SI, 1(AX) ADDQ $0x02, BX ADDQ $0x02, AX - CMPL BP, $0x40 + CMPL SI, $0x40 JL memmove_standalone JMP memmove_long_standalone one_byte_standalone: - SHLB $0x02, BP - MOVB BP, (AX) + SHLB $0x02, SI + MOVB SI, (AX) ADDQ $0x01, BX ADDQ $0x01, AX @@ -12423,30 +15165,30 @@ memmove_standalone: JMP emit_lit_memmove_standalone_memmove_move_33through64 emit_lit_memmove_standalone_memmove_move_1or2: - MOVB (CX), BP + MOVB (CX), SI MOVB -1(CX)(DX*1), CL - MOVB BP, (AX) + MOVB SI, (AX) MOVB CL, -1(AX)(DX*1) JMP emit_literal_end_standalone emit_lit_memmove_standalone_memmove_move_3: - MOVW (CX), BP + MOVW (CX), SI MOVB 2(CX), CL - MOVW BP, (AX) + MOVW SI, (AX) MOVB CL, 2(AX) JMP emit_literal_end_standalone emit_lit_memmove_standalone_memmove_move_4through7: - MOVL (CX), BP + MOVL (CX), SI MOVL -4(CX)(DX*1), CX - MOVL BP, (AX) + MOVL SI, (AX) MOVL CX, -4(AX)(DX*1) JMP emit_literal_end_standalone emit_lit_memmove_standalone_memmove_move_8through16: - MOVQ (CX), BP + MOVQ (CX), SI MOVQ -8(CX)(DX*1), CX - MOVQ BP, (AX) + MOVQ SI, (AX) MOVQ CX, -8(AX)(DX*1) JMP emit_literal_end_standalone @@ -12475,35 +15217,35 @@ memmove_long_standalone: MOVOU 16(CX), X1 MOVOU -32(CX)(DX*1), X2 MOVOU -16(CX)(DX*1), X3 - MOVQ DX, SI - SHRQ $0x05, SI - MOVQ AX, BP - ANDL $0x0000001f, BP - MOVQ $0x00000040, DI - SUBQ BP, DI - DECQ SI + MOVQ DX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI JA emit_lit_memmove_long_standalonelarge_forward_sse_loop_32 - LEAQ -32(CX)(DI*1), BP - LEAQ -32(AX)(DI*1), R8 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 emit_lit_memmove_long_standalonelarge_big_loop_back: - MOVOU (BP), X4 - MOVOU 16(BP), X5 - MOVOA X4, (R8) - MOVOA X5, 16(R8) + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI ADDQ $0x20, R8 - ADDQ $0x20, BP - ADDQ $0x20, DI - DECQ SI + DECQ DI JNA emit_lit_memmove_long_standalonelarge_big_loop_back emit_lit_memmove_long_standalonelarge_forward_sse_loop_32: - MOVOU -32(CX)(DI*1), X4 - MOVOU -16(CX)(DI*1), X5 - MOVOA X4, -32(AX)(DI*1) - MOVOA X5, -16(AX)(DI*1) - ADDQ $0x20, DI - CMPQ DX, DI + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ DX, R8 JAE emit_lit_memmove_long_standalonelarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) @@ -12528,11 +15270,11 @@ TEXT ·emitRepeat(SB), NOSPLIT, $0-48 // emitRepeat emit_repeat_again_standalone: - MOVL DX, BP + MOVL DX, SI LEAL -4(DX), DX - CMPL BP, $0x08 + CMPL SI, $0x08 JLE repeat_two_standalone - CMPL BP, $0x0c + CMPL SI, $0x0c JGE cant_repeat_two_offset_standalone CMPL CX, $0x00000800 JLT repeat_two_offset_standalone @@ -12588,8 +15330,8 @@ repeat_two_standalone: JMP gen_emit_repeat_end repeat_two_offset_standalone: - XORQ BP, BP - LEAL 1(BP)(DX*4), DX + XORQ SI, SI + LEAL 1(SI)(DX*4), DX MOVB CL, 1(AX) SARL $0x08, CX SHLL $0x05, CX @@ -12626,11 +15368,11 @@ four_bytes_loop_back_standalone: // emitRepeat emit_repeat_again_standalone_emit_copy: - MOVL DX, BP + MOVL DX, SI LEAL -4(DX), DX - CMPL BP, $0x08 + CMPL SI, $0x08 JLE repeat_two_standalone_emit_copy - CMPL BP, $0x0c + CMPL SI, $0x0c JGE cant_repeat_two_offset_standalone_emit_copy CMPL CX, $0x00000800 JLT repeat_two_offset_standalone_emit_copy @@ -12686,8 +15428,8 @@ repeat_two_standalone_emit_copy: JMP gen_emit_copy_end repeat_two_offset_standalone_emit_copy: - XORQ BP, BP - LEAL 1(BP)(DX*4), DX + XORQ SI, SI + LEAL 1(SI)(DX*4), DX MOVB CL, 1(AX) SARL $0x08, CX SHLL $0x05, CX @@ -12701,8 +15443,8 @@ repeat_two_offset_standalone_emit_copy: four_bytes_remain_standalone: TESTL DX, DX JZ gen_emit_copy_end - MOVB $0x03, BP - LEAL -4(BP)(DX*4), DX + MOVB $0x03, SI + LEAL -4(SI)(DX*4), DX MOVB DL, (AX) MOVL CX, 1(AX) ADDQ $0x05, BX @@ -12720,11 +15462,11 @@ two_byte_offset_standalone: // emitRepeat emit_repeat_again_standalone_emit_copy_short: - MOVL DX, BP + MOVL DX, SI LEAL -4(DX), DX - CMPL BP, $0x08 + CMPL SI, $0x08 JLE repeat_two_standalone_emit_copy_short - CMPL BP, $0x0c + CMPL SI, $0x0c JGE cant_repeat_two_offset_standalone_emit_copy_short CMPL CX, $0x00000800 JLT repeat_two_offset_standalone_emit_copy_short @@ -12780,8 +15522,8 @@ repeat_two_standalone_emit_copy_short: JMP gen_emit_copy_end repeat_two_offset_standalone_emit_copy_short: - XORQ BP, BP - LEAL 1(BP)(DX*4), DX + XORQ SI, SI + LEAL 1(SI)(DX*4), DX MOVB CL, 1(AX) SARL $0x08, CX SHLL $0x05, CX @@ -12797,8 +15539,8 @@ two_byte_offset_short_standalone: JGE emit_copy_three_standalone CMPL CX, $0x00000800 JGE emit_copy_three_standalone - MOVB $0x01, BP - LEAL -16(BP)(DX*4), DX + MOVB $0x01, SI + LEAL -16(SI)(DX*4), DX MOVB CL, 1(AX) SHRL $0x08, CX SHLL $0x05, CX @@ -12809,8 +15551,8 @@ two_byte_offset_short_standalone: JMP gen_emit_copy_end emit_copy_three_standalone: - MOVB $0x02, BP - LEAL -4(BP)(DX*4), DX + MOVB $0x02, SI + LEAL -4(SI)(DX*4), DX MOVB DL, (AX) MOVW CX, 1(AX) ADDQ $0x03, BX @@ -12846,8 +15588,8 @@ four_bytes_loop_back_standalone_snappy: four_bytes_remain_standalone_snappy: TESTL DX, DX JZ gen_emit_copy_end_snappy - MOVB $0x03, BP - LEAL -4(BP)(DX*4), DX + MOVB $0x03, SI + LEAL -4(SI)(DX*4), DX MOVB DL, (AX) MOVL CX, 1(AX) ADDQ $0x05, BX @@ -12869,8 +15611,8 @@ two_byte_offset_short_standalone_snappy: JGE emit_copy_three_standalone_snappy CMPL CX, $0x00000800 JGE emit_copy_three_standalone_snappy - MOVB $0x01, BP - LEAL -16(BP)(DX*4), DX + MOVB $0x01, SI + LEAL -16(SI)(DX*4), DX MOVB CL, 1(AX) SHRL $0x08, CX SHLL $0x05, CX @@ -12881,8 +15623,8 @@ two_byte_offset_short_standalone_snappy: JMP gen_emit_copy_end_snappy emit_copy_three_standalone_snappy: - MOVB $0x02, BP - LEAL -4(BP)(DX*4), DX + MOVB $0x02, SI + LEAL -4(SI)(DX*4), DX MOVB DL, (AX) MOVW CX, 1(AX) ADDQ $0x03, BX @@ -12899,23 +15641,23 @@ TEXT ·matchLen(SB), NOSPLIT, $0-56 MOVQ a_len+8(FP), DX // matchLen - XORL BP, BP + XORL SI, SI CMPL DX, $0x08 JL matchlen_single_standalone matchlen_loopback_standalone: - MOVQ (AX)(BP*1), BX - XORQ (CX)(BP*1), BX + MOVQ (AX)(SI*1), BX + XORQ (CX)(SI*1), BX TESTQ BX, BX JZ matchlen_loop_standalone BSFQ BX, BX SARQ $0x03, BX - LEAL (BP)(BX*1), BP + LEAL (SI)(BX*1), SI JMP gen_match_len_end matchlen_loop_standalone: LEAL -8(DX), DX - LEAL 8(BP), BP + LEAL 8(SI), SI CMPL DX, $0x08 JGE matchlen_loopback_standalone @@ -12924,13 +15666,13 @@ matchlen_single_standalone: JZ gen_match_len_end matchlen_single_loopback_standalone: - MOVB (AX)(BP*1), BL - CMPB (CX)(BP*1), BL + MOVB (AX)(SI*1), BL + CMPB (CX)(SI*1), BL JNE gen_match_len_end - LEAL 1(BP), BP + LEAL 1(SI), SI DECL DX JNZ matchlen_single_loopback_standalone gen_match_len_end: - MOVQ BP, ret+48(FP) + MOVQ SI, ret+48(FP) RET diff --git a/vendor/github.com/klauspost/compress/s2/s2.go b/vendor/github.com/klauspost/compress/s2/s2.go index a3e4abfe..89d69e96 100644 --- a/vendor/github.com/klauspost/compress/s2/s2.go +++ b/vendor/github.com/klauspost/compress/s2/s2.go @@ -90,6 +90,9 @@ const ( // Default block size defaultBlockSize = 1 << 20 + // maxSnappyBlockSize is the maximum snappy block size. + maxSnappyBlockSize = 1 << 16 + obufHeaderLen = checksumSize + chunkHeaderSize ) @@ -106,7 +109,7 @@ var crcTable = crc32.MakeTable(crc32.Castagnoli) // https://github.com/google/snappy/blob/master/framing_format.txt func crc(b []byte) uint32 { c := crc32.Update(0, crcTable, b) - return uint32(c>>15|c<<17) + 0xa282ead8 + return c>>15 | c<<17 + 0xa282ead8 } // literalExtraSize returns the extra size of encoding n literals. diff --git a/vendor/modules.txt b/vendor/modules.txt index 9e7626eb..7ae9396d 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -1,6 +1,6 @@ # github.com/golang/protobuf v1.4.2 ## explicit -# github.com/klauspost/compress v1.11.12 +# github.com/klauspost/compress v1.13.4 ## explicit github.com/klauspost/compress/s2 # github.com/minio/highwayhash v1.0.1