mirror of
				https://github.com/taigrr/wtf
				synced 2025-01-18 04:03:14 -08:00 
			
		
		
		
	
		
			
				
	
	
		
			197 lines
		
	
	
		
			5.5 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			197 lines
		
	
	
		
			5.5 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
// Copyright 2015 Garrett D'Amore
 | 
						|
//
 | 
						|
// Licensed under the Apache License, Version 2.0 (the "License");
 | 
						|
// you may not use this file except in compliance with the License.
 | 
						|
// You may obtain a copy of the license at
 | 
						|
//
 | 
						|
//    http://www.apache.org/licenses/LICENSE-2.0
 | 
						|
//
 | 
						|
// Unless required by applicable law or agreed to in writing, software
 | 
						|
// distributed under the License is distributed on an "AS IS" BASIS,
 | 
						|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | 
						|
// See the License for the specific language governing permissions and
 | 
						|
// limitations under the License.
 | 
						|
 | 
						|
package encoding
 | 
						|
 | 
						|
import (
 | 
						|
	"sync"
 | 
						|
	"unicode/utf8"
 | 
						|
 | 
						|
	"golang.org/x/text/encoding"
 | 
						|
	"golang.org/x/text/transform"
 | 
						|
)
 | 
						|
 | 
						|
const (
	// RuneError is an alias for the UTF-8 replacement rune, '\uFFFD'.
	RuneError = '\uFFFD'

	// RuneSelf is the rune below which UTF-8 and the Unicode values are
	// identical.  It is also the limit for ASCII.
	RuneSelf = 0x80

	// ASCIISub is the ASCII substitution character.
	ASCIISub = '\x1a'
)
 | 
						|
 | 
						|
// Charmap is a structure for setting up encodings for 8-bit character sets,
 | 
						|
// for transforming between UTF8 and that other character set.  It has some
 | 
						|
// ideas borrowed from golang.org/x/text/encoding/charmap, but it uses a
 | 
						|
// different implementation.  This implementation uses maps, and supports
 | 
						|
// user-defined maps.
 | 
						|
//
 | 
						|
// We do assume that a character map has a reasonable substitution character,
 | 
						|
// and that valid encodings are stable (exactly a 1:1 map) and stateless
 | 
						|
// (that is there is no shift character or anything like that.)  Hence this
 | 
						|
// approach will not work for many East Asian character sets.
 | 
						|
//
 | 
						|
// Measurement shows little or no measurable difference in the performance of
 | 
						|
// the two approaches.  The difference was down to a couple of nsec/op, and
 | 
						|
// no consistent pattern as to which ran faster.  With the conversion to
 | 
						|
// UTF-8 the code takes about 25 nsec/op.  The conversion in the reverse
 | 
						|
// direction takes about 100 nsec/op.  (The larger cost for conversion
 | 
						|
// from UTF-8 is most likely due to the need to convert the UTF-8 byte stream
 | 
						|
// to a rune before conversion.
 | 
						|
//
 | 
						|
type Charmap struct {
 | 
						|
	transform.NopResetter
 | 
						|
	bytes map[rune]byte
 | 
						|
	runes [256][]byte
 | 
						|
	once  sync.Once
 | 
						|
 | 
						|
	// The map between bytes and runes.  To indicate that a specific
 | 
						|
	// byte value is invalid for a charcter set, use the rune
 | 
						|
	// utf8.RuneError.  Values that are absent from this map will
 | 
						|
	// be assumed to have the identity mapping -- that is the default
 | 
						|
	// is to assume ISO8859-1, where all 8-bit characters have the same
 | 
						|
	// numeric value as their Unicode runes.  (Not to be confused with
 | 
						|
	// the UTF-8 values, which *will* be different for non-ASCII runes.)
 | 
						|
	//
 | 
						|
	// If no values less than RuneSelf are changed (or have non-identity
 | 
						|
	// mappings), then the character set is assumed to be an ASCII
 | 
						|
	// superset, and certain assumptions and optimizations become
 | 
						|
	// available for ASCII bytes.
 | 
						|
	Map map[byte]rune
 | 
						|
 | 
						|
	// The ReplacementChar is the byte value to use for substitution.
 | 
						|
	// It should normally be ASCIISub for ASCII encodings.  This may be
 | 
						|
	// unset (left to zero) for mappings that are strictly ASCII supersets.
 | 
						|
	// In that case ASCIISub will be assumed instead.
 | 
						|
	ReplacementChar byte
 | 
						|
}
 | 
						|
 | 
						|
type cmapDecoder struct {
 | 
						|
	transform.NopResetter
 | 
						|
	runes [256][]byte
 | 
						|
}
 | 
						|
 | 
						|
type cmapEncoder struct {
 | 
						|
	transform.NopResetter
 | 
						|
	bytes   map[rune]byte
 | 
						|
	replace byte
 | 
						|
}
 | 
						|
 | 
						|
// Init initializes internal values of a character map.  This should
 | 
						|
// be done early, to minimize the cost of allocation of transforms
 | 
						|
// later.  It is not strictly necessary however, as the allocation
 | 
						|
// functions will arrange to call it if it has not already been done.
 | 
						|
func (c *Charmap) Init() {
 | 
						|
	c.once.Do(c.initialize)
 | 
						|
}
 | 
						|
 | 
						|
func (c *Charmap) initialize() {
 | 
						|
	c.bytes = make(map[rune]byte)
 | 
						|
	ascii := true
 | 
						|
 | 
						|
	for i := 0; i < 256; i++ {
 | 
						|
		r, ok := c.Map[byte(i)]
 | 
						|
		if !ok {
 | 
						|
			r = rune(i)
 | 
						|
		}
 | 
						|
		if r < 128 && r != rune(i) {
 | 
						|
			ascii = false
 | 
						|
		}
 | 
						|
		if r != RuneError {
 | 
						|
			c.bytes[r] = byte(i)
 | 
						|
		}
 | 
						|
		utf := make([]byte, utf8.RuneLen(r))
 | 
						|
		utf8.EncodeRune(utf, r)
 | 
						|
		c.runes[i] = utf
 | 
						|
	}
 | 
						|
	if ascii && c.ReplacementChar == '\x00' {
 | 
						|
		c.ReplacementChar = ASCIISub
 | 
						|
	}
 | 
						|
}
 | 
						|
 | 
						|
// NewDecoder returns a Decoder the converts from the 8-bit
 | 
						|
// character set to UTF-8.  Unknown mappings, if any, are mapped
 | 
						|
// to '\uFFFD'.
 | 
						|
func (c *Charmap) NewDecoder() *encoding.Decoder {
 | 
						|
	c.Init()
 | 
						|
	return &encoding.Decoder{Transformer: &cmapDecoder{runes: c.runes}}
 | 
						|
}
 | 
						|
 | 
						|
// NewEncoder returns a Transformer that converts from UTF8 to the
 | 
						|
// 8-bit character set.  Unknown mappings are mapped to 0x1A.
 | 
						|
func (c *Charmap) NewEncoder() *encoding.Encoder {
 | 
						|
	c.Init()
 | 
						|
	return &encoding.Encoder{
 | 
						|
		Transformer: &cmapEncoder{
 | 
						|
			bytes:   c.bytes,
 | 
						|
			replace: c.ReplacementChar,
 | 
						|
		},
 | 
						|
	}
 | 
						|
}
 | 
						|
 | 
						|
func (d *cmapDecoder) Transform(dst, src []byte, atEOF bool) (int, int, error) {
 | 
						|
	var e error
 | 
						|
	var ndst, nsrc int
 | 
						|
 | 
						|
	for _, c := range src {
 | 
						|
		b := d.runes[c]
 | 
						|
		l := len(b)
 | 
						|
 | 
						|
		if ndst+l > len(dst) {
 | 
						|
			e = transform.ErrShortDst
 | 
						|
			break
 | 
						|
		}
 | 
						|
		for i := 0; i < l; i++ {
 | 
						|
			dst[ndst] = b[i]
 | 
						|
			ndst++
 | 
						|
		}
 | 
						|
		nsrc++
 | 
						|
	}
 | 
						|
	return ndst, nsrc, e
 | 
						|
}
 | 
						|
 | 
						|
func (d *cmapEncoder) Transform(dst, src []byte, atEOF bool) (int, int, error) {
 | 
						|
	var e error
 | 
						|
	var ndst, nsrc int
 | 
						|
	for nsrc < len(src) {
 | 
						|
		if ndst >= len(dst) {
 | 
						|
			e = transform.ErrShortDst
 | 
						|
			break
 | 
						|
		}
 | 
						|
 | 
						|
		r, sz := utf8.DecodeRune(src[nsrc:])
 | 
						|
		if r == utf8.RuneError && sz == 1 {
 | 
						|
			// If its inconclusive due to insufficient data in
 | 
						|
			// in the source, report it
 | 
						|
			if !atEOF && !utf8.FullRune(src[nsrc:]) {
 | 
						|
				e = transform.ErrShortSrc
 | 
						|
				break
 | 
						|
			}
 | 
						|
		}
 | 
						|
 | 
						|
		if c, ok := d.bytes[r]; ok {
 | 
						|
			dst[ndst] = c
 | 
						|
		} else {
 | 
						|
			dst[ndst] = d.replace
 | 
						|
		}
 | 
						|
		nsrc += sz
 | 
						|
		ndst++
 | 
						|
	}
 | 
						|
 | 
						|
	return ndst, nsrc, e
 | 
						|
}
 |