Issue 477: Improve handling of escaped substrings

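This change evaluates and interpolates backslash-escaped sequences (\xXX, \t, \n, \r, \", \\) in double-quoted and undelimited strings. It also adds tests for various edge conditions, and ensures that a bare string containing an escape (e.g. `\\true`, which yields `\true`) is always lexed as a string rather than as a bool or a variable.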
2026-04-02 03:38:42 -07:00 · 2017-04-27 13:24:09 -07:00
parent a94b5e0f4c
commit 122b697e41
2 changed files with 235 additions and 37 deletions
--- a/conf/lex.go
+++ b/conf/lex.go
@@ -16,6 +16,7 @@
 package conf

 import (
+	"encoding/hex"
 	"fmt"
 	"strings"
 	"unicode"
@@ -82,6 +83,10 @@ type lexer struct {
 	// nested arrays. The last state on the stack is used after a value has
 	// been lexed. Similarly for comments.
 	stack []stateFn
+
+	// Used for processing escapable substrings in double-quoted and raw strings
+	stringParts   []string
+	stringStateFn stateFn
 }

 type item struct {
@@ -103,11 +108,12 @@ func (lx *lexer) nextItem() item {

 func lex(input string) *lexer {
 	lx := &lexer{
-		input: input,
-		state: lexTop,
-		line:  1,
-		items: make(chan item, 10),
-		stack: make([]stateFn, 0, 10),
+		input:       input,
+		state:       lexTop,
+		line:        1,
+		items:       make(chan item, 10),
+		stack:       make([]stateFn, 0, 10),
+		stringParts: []string{},
 	}
 	return lx
 }
@@ -127,10 +133,37 @@ func (lx *lexer) pop() stateFn {
 }

 func (lx *lexer) emit(typ itemType) {
-	lx.items <- item{typ, lx.input[lx.start:lx.pos], lx.line}
+	lx.items <- item{typ, strings.Join(lx.stringParts, "") + lx.input[lx.start:lx.pos], lx.line}
 	lx.start = lx.pos
 }

+func (lx *lexer) emitString() {
+	var finalString string
+	if len(lx.stringParts) > 0 {
+		finalString = strings.Join(lx.stringParts, "") + lx.input[lx.start:lx.pos]
+		lx.stringParts = []string{}
+	} else {
+		finalString = lx.input[lx.start:lx.pos]
+	}
+	lx.items <- item{itemString, finalString, lx.line}
+	lx.start = lx.pos
+}
+
+func (lx *lexer) addCurrentStringPart(offset int) {
+	lx.stringParts = append(lx.stringParts, lx.input[lx.start:lx.pos-offset])
+	lx.start = lx.pos
+}
+
+func (lx *lexer) addStringPart(s string) stateFn {
+	lx.stringParts = append(lx.stringParts, s)
+	lx.start = lx.pos
+	return lx.stringStateFn
+}
+
+func (lx *lexer) hasEscapedParts() bool {
+	return len(lx.stringParts) > 0
+}
+
 func (lx *lexer) next() (r rune) {
 	if lx.pos >= len(lx.input) {
 		lx.width = 0
@@ -453,6 +486,7 @@ func lexValue(lx *lexer) stateFn {
 		return lexQuotedString
 	case r == dqStringStart:
 		lx.ignore() // ignore the " or '
+		lx.stringStateFn = lexDubQuotedString
 		return lexDubQuotedString
 	case r == '-':
 		return lexNegNumberStart
@@ -468,6 +502,7 @@ func lexValue(lx *lexer) stateFn {
 		return lx.errorf("Expected value but found new line")
 	}
 	lx.backup()
+	lx.stringStateFn = lexString
 	return lexString
 }

@@ -721,9 +756,12 @@ func lexQuotedString(lx *lexer) stateFn {
 func lexDubQuotedString(lx *lexer) stateFn {
 	r := lx.next()
 	switch {
+	case r == '\\':
+		lx.addCurrentStringPart(1)
+		return lexStringEscape
 	case r == dqStringEnd:
 		lx.backup()
-		lx.emit(itemString)
+		lx.emitString()
 		lx.next()
 		lx.ignore()
 		return lx.pop()
@@ -736,6 +774,7 @@ func lexString(lx *lexer) stateFn {
 	r := lx.next()
 	switch {
 	case r == '\\':
+		lx.addCurrentStringPart(1)
 		return lexStringEscape
 	// Termination of non-quoted strings
 	case isNL(r) || r == eof || r == optValTerm ||
@@ -743,17 +782,19 @@ func lexString(lx *lexer) stateFn {
 		isWhitespace(r):

 		lx.backup()
-		if lx.isBool() {
+		if lx.hasEscapedParts() {
+			lx.emitString()
+		} else if lx.isBool() {
 			lx.emit(itemBool)
 		} else if lx.isVariable() {
 			lx.emit(itemVariable)
 		} else {
-			lx.emit(itemString)
+			lx.emitString()
 		}
 		return lx.pop()
 	case r == sqStringEnd:
 		lx.backup()
-		lx.emit(itemString)
+		lx.emitString()
 		lx.next()
 		lx.ignore()
 		return lx.pop()
@@ -803,15 +844,15 @@ func lexStringEscape(lx *lexer) stateFn {
 	case 'x':
 		return lexStringBinary
 	case 't':
-		fallthrough
+		return lx.addStringPart("\t")
 	case 'n':
-		fallthrough
+		return lx.addStringPart("\n")
 	case 'r':
-		fallthrough
+		return lx.addStringPart("\r")
 	case '"':
-		fallthrough
+		return lx.addStringPart("\"")
 	case '\\':
-		return lexString
+		return lx.addStringPart("\\")
 	}
 	return lx.errorf("Invalid escape character '%v'. Only the following "+
 		"escape characters are allowed: \\xXX, \\t, \\n, \\r, \\\", \\\\.", r)
@@ -821,17 +862,20 @@ func lexStringEscape(lx *lexer) stateFn {
 // that the '\x' has already been consumed.
 func lexStringBinary(lx *lexer) stateFn {
 	r := lx.next()
-	if !isHexadecimal(r) {
-		return lx.errorf("Expected two hexadecimal digits after '\\x', but "+
-			"got '%v' instead.", r)
+	if isNL(r) {
+		return lx.errorf("Expected two hexadecimal digits after '\\x', but hit end of line")
 	}
-
 	r = lx.next()
-	if !isHexadecimal(r) {
-		return lx.errorf("Expected two hexadecimal digits after '\\x', but "+
-			"got '%v' instead.", r)
+	if isNL(r) {
+		return lx.errorf("Expected two hexadecimal digits after '\\x', but hit end of line")
 	}
-	return lexString
+	offset := lx.pos - 2
+	byteString, err := hex.DecodeString(lx.input[offset:lx.pos])
+	if err != nil {
+		return lx.errorf("Expected two hexadecimal digits after '\\x', but got '%s'", lx.input[offset:lx.pos])
+	}
+	lx.addStringPart(string(byteString))
+	return lx.stringStateFn
 }

 // lexNumberOrDateStart consumes either a (positive) integer, a float, a datetime, or IP.
@@ -1028,12 +1072,6 @@ func isNL(r rune) bool {
 	return r == '\n' || r == '\r'
 }

-func isHexadecimal(r rune) bool {
-	return (r >= '0' && r <= '9') ||
-		(r >= 'a' && r <= 'f') ||
-		(r >= 'A' && r <= 'F')
-}
-
 func (itype itemType) String() string {
 	switch itype {
 	case itemError:
--- a/conf/lex_test.go
+++ b/conf/lex_test.go
@@ -65,10 +65,20 @@ func TestComplexStringValues(t *testing.T) {
 func TestBinaryString(t *testing.T) {
 	expectedItems := []item{
 		{itemKey, "foo", 1},
-		{itemString, "\\x22", 1},
+		{itemString, "e", 1},
 		{itemEOF, "", 1},
 	}
-	lx := lex("foo = \\x22")
+	lx := lex("foo = \\x65")
+	expect(t, lx, expectedItems)
+}
+
+func TestBinaryStringLatin1(t *testing.T) {
+	expectedItems := []item{
+		{itemKey, "foo", 1},
+		{itemString, "\xe9", 1},
+		{itemEOF, "", 1},
+	}
+	lx := lex("foo = \\xe9")
 	expect(t, lx, expectedItems)
 }

@@ -197,6 +207,46 @@ func TestSimpleKeyFloatValues(t *testing.T) {
 	expect(t, lx, expectedItems)
 }

+func TestBadBinaryStringEndingAfterZeroHexChars(t *testing.T) {
+	expectedItems := []item{
+		{itemKey, "foo", 1},
+		{itemError, "Expected two hexadecimal digits after '\\x', but hit end of line", 2},
+		{itemEOF, "", 1},
+	}
+	lx := lex("foo = xyz\\x\n")
+	expect(t, lx, expectedItems)
+}
+
+func TestBadBinaryStringEndingAfterOneHexChar(t *testing.T) {
+	expectedItems := []item{
+		{itemKey, "foo", 1},
+		{itemError, "Expected two hexadecimal digits after '\\x', but hit end of line", 2},
+		{itemEOF, "", 1},
+	}
+	lx := lex("foo = xyz\\xF\n")
+	expect(t, lx, expectedItems)
+}
+
+func TestBadBinaryStringWithZeroHexChars(t *testing.T) {
+	expectedItems := []item{
+		{itemKey, "foo", 1},
+		{itemError, "Expected two hexadecimal digits after '\\x', but got ']\"'", 1},
+		{itemEOF, "", 1},
+	}
+	lx := lex(`foo = "[\x]"`)
+	expect(t, lx, expectedItems)
+}
+
+func TestBadBinaryStringWithOneHexChar(t *testing.T) {
+	expectedItems := []item{
+		{itemKey, "foo", 1},
+		{itemError, "Expected two hexadecimal digits after '\\x', but got 'e]'", 1},
+		{itemEOF, "", 1},
+	}
+	lx := lex(`foo = "[\xe]"`)
+	expect(t, lx, expectedItems)
+}
+
 func TestBadFloatValues(t *testing.T) {
 	expectedItems := []item{
 		{itemKey, "foo", 1},
@@ -269,7 +319,7 @@ func TestRawString(t *testing.T) {
 	lx := lex("foo = bar")
 	expect(t, lx, expectedItems)

-	lx = lex(`foo = bar' `)
+	lx = lex(`foo = bar' `) //'single-quote for emacs TODO: Remove me
 	expect(t, lx, expectedItems)
 }

@@ -523,21 +573,131 @@ bs   = \\
 func TestEscapedString(t *testing.T) {
 	expectedItems := []item{
 		{itemKey, "foo", 2},
-		{itemString, `\t`, 2},
+		{itemString, "\t", 2},
 		{itemKey, "bar", 3},
-		{itemString, `\r`, 3},
+		{itemString, "\r", 3},
 		{itemKey, "baz", 4},
-		{itemString, `\n`, 4},
+		{itemString, "\n", 4},
 		{itemKey, "q", 5},
-		{itemString, `\"`, 5},
+		{itemString, "\"", 5},
 		{itemKey, "bs", 6},
-		{itemString, `\\`, 6},
+		{itemString, "\\", 6},
 		{itemEOF, "", 6},
 	}
 	lx := lex(escString)
 	expect(t, lx, expectedItems)
 }

+func TestCompoundStringES(t *testing.T) {
+	expectedItems := []item{
+		{itemKey, "foo", 1},
+		{itemString, "\\end", 1},
+		{itemEOF, "", 2},
+	}
+	lx := lex(`foo = "\\end"`)
+	expect(t, lx, expectedItems)
+}
+
+func TestCompoundStringSE(t *testing.T) {
+	expectedItems := []item{
+		{itemKey, "foo", 1},
+		{itemString, "start\\", 1},
+		{itemEOF, "", 2},
+	}
+	lx := lex(`foo = "start\\"`)
+	expect(t, lx, expectedItems)
+}
+
+func TestCompoundStringEE(t *testing.T) {
+	expectedItems := []item{
+		{itemKey, "foo", 1},
+		{itemString, "Eq", 1},
+		{itemEOF, "", 2},
+	}
+	lx := lex(`foo = \x45\x71`)
+	expect(t, lx, expectedItems)
+}
+
+func TestCompoundStringSEE(t *testing.T) {
+	expectedItems := []item{
+		{itemKey, "foo", 1},
+		{itemString, "startEq", 1},
+		{itemEOF, "", 2},
+	}
+	lx := lex(`foo = start\x45\x71`)
+	expect(t, lx, expectedItems)
+}
+
+func TestCompoundStringSES(t *testing.T) {
+	expectedItems := []item{
+		{itemKey, "foo", 1},
+		{itemString, "start|end", 1},
+		{itemEOF, "", 2},
+	}
+	lx := lex(`foo = start\x7Cend`)
+	expect(t, lx, expectedItems)
+}
+
+func TestCompoundStringEES(t *testing.T) {
+	expectedItems := []item{
+		{itemKey, "foo", 1},
+		{itemString, "<>end", 1},
+		{itemEOF, "", 2},
+	}
+	lx := lex(`foo = \x3c\x3eend`)
+	expect(t, lx, expectedItems)
+}
+
+func TestCompoundStringESE(t *testing.T) {
+	expectedItems := []item{
+		{itemKey, "foo", 1},
+		{itemString, "<middle>", 1},
+		{itemEOF, "", 2},
+	}
+	lx := lex(`foo = \x3cmiddle\x3E`)
+	expect(t, lx, expectedItems)
+}
+
+func TestNonBool(t *testing.T) {
+	expectedItems := []item{
+		{itemKey, "foo", 1},
+		{itemString, "\\true", 1},
+		{itemEOF, "", 2},
+	}
+	lx := lex(`foo = \\true`)
+	expect(t, lx, expectedItems)
+}
+
+func TestNonVariable(t *testing.T) {
+	expectedItems := []item{
+		{itemKey, "foo", 1},
+		{itemString, "\\$var", 1},
+		{itemEOF, "", 2},
+	}
+	lx := lex(`foo = \\$var`)
+	expect(t, lx, expectedItems)
+}
+
+func TestEmptyStringDQ(t *testing.T) {
+	expectedItems := []item{
+		{itemKey, "foo", 1},
+		{itemString, "", 1},
+		{itemEOF, "", 2},
+	}
+	lx := lex(`foo = ""`)
+	expect(t, lx, expectedItems)
+}
+
+func TestEmptyStringSQ(t *testing.T) {
+	expectedItems := []item{
+		{itemKey, "foo", 1},
+		{itemString, "", 1},
+		{itemEOF, "", 2},
+	}
+	lx := lex(`foo = ''`)
+	expect(t, lx, expectedItems)
+}
+
 var nestedWhitespaceMap = `
 foo  {
  host  {