sh: Add $'quoting' (C-style escape sequences).

A string between $' and ' may contain backslash escape sequences similar to the ones in a C string constant (except that a single-quote must be escaped and a double-quote need not be). Details are in the sh(1) man page. This construct is useful for including unprintable characters, tabs and newlines in strings; while this can be done with a command substitution containing a printf command, that needs ugly workarounds if the result is to end with a newline, as command substitution removes all trailing newlines. The construct may also be useful in future to describe unprintable characters without needing to write those characters themselves in 'set -x', 'export -p' and the like. The implementation attempts to comply with the proposal for the next issue of the POSIX specification. Because this construct is not in POSIX.1-2008, using it in scripts intended to be portable is unwise. Matching the minimal locale support in the rest of sh, the \u and \U sequences are currently not useful. Exp-run done by: pav (with some other sh(1) changes)
author: jilles <jilles@FreeBSD.org> 2011-05-05 20:55:55 +0000
committer: jilles <jilles@FreeBSD.org> 2011-05-05 20:55:55 +0000
commit: 5a49f52603051288db09536911ad4c73579aeb99 (patch)
tree: f65bdb50369c1f6631b70c1264abbc69f9ce4513 /bin
parent: 7ec44d66a6202d6f9bf5a7ef04ba108896f96bea (diff)
download: FreeBSD-src-5a49f52603051288db09536911ad4c73579aeb99.zip
FreeBSD-src-5a49f52603051288db09536911ad4c73579aeb99.tar.gz
3 files changed, 212 insertions, 8 deletions
diff --git a/bin/sh/mksyntax.c b/bin/sh/mksyntax.c
index 6bd1390..9aa7450 100644
--- a/bin/sh/mksyntax.c
+++ b/bin/sh/mksyntax.c
@@ -64,6 +64,7 @@ struct synclass synclass[] = {
 	{ "CWORD",	"character is nothing special" },
 	{ "CNL",	"newline character" },
 	{ "CBACK",	"a backslash character" },
+	{ "CSBACK",	"a backslash character in single quotes" },
 	{ "CSQUOTE",	"single quote" },
 	{ "CDQUOTE",	"double quote" },
 	{ "CENDQUOTE",	"a terminating quote" },
@@ -224,6 +225,7 @@ main(int argc __unused, char **argv __unused)
 	init();
 	fputs("\n/* syntax table used when in single quotes */\n", cfile);
 	add("\n", "CNL");
+	add("\\", "CSBACK");
 	add("'", "CENDQUOTE");
 	/* ':/' for tilde expansion, '-' for [a\-x] pattern ranges */
 	add("!*?[=~:/-", "CCTL");
diff --git a/bin/sh/parser.c b/bin/sh/parser.c
index 43822f9..5133d67 100644
--- a/bin/sh/parser.c
+++ b/bin/sh/parser.c
@@ -1127,6 +1127,127 @@ done:
 
 
 /*
+ * Called to parse a backslash escape sequence inside $'...'.
+ * The backslash has already been read.
+ * The decoded byte is appended at out (after CTLESC if SQSYNTAX marks it
+ * CCTL) and the advanced pointer is returned; backslash-newline and NUL
+ * append nothing.  Malformed sequences are passed to synerror().
+ */
+static char *
+readcstyleesc(char *out)
+{
+	int c, vc, i, n;
+	unsigned int v;	/* unsigned: shifting in hex digits must not overflow */
+
+	c = pgetc();
+	switch (c) {
+	case '\0':
+		synerror("Unterminated quoted string");
+	case '\n':
+		plinno++;
+		setprompt(doprompt ? 2 : 0);
+		return out;
+	case '\\': case '\'': case '"':
+		v = c;
+		break;
+	case 'a': v = '\a'; break;
+	case 'b': v = '\b'; break;
+	case 'e': v = '\033'; break;
+	case 'f': v = '\f'; break;
+	case 'n': v = '\n'; break;
+	case 'r': v = '\r'; break;
+	case 't': v = '\t'; break;
+	case 'v': v = '\v'; break;
+	case 'x':
+		  /* Any number of hex digits; only the low byte is kept. */
+		  v = 0;
+		  for (;;) {
+			  c = pgetc();
+			  if (c >= '0' && c <= '9')
+				  v = (v << 4) + c - '0';
+			  else if (c >= 'A' && c <= 'F')
+				  v = (v << 4) + c - 'A' + 10;
+			  else if (c >= 'a' && c <= 'f')
+				  v = (v << 4) + c - 'a' + 10;
+			  else
+				  break;
+		  }
+		  pungetc();
+		  break;
+	case '0': case '1': case '2': case '3':
+	case '4': case '5': case '6': case '7':
+		  /* One to three octal digits; the first is already in c. */
+		  v = c - '0';
+		  for (i = 0; i < 2; i++) {
+			  c = pgetc();
+			  if (c < '0' || c > '7') {
+				  pungetc();
+				  break;
+			  }
+			  v = (v << 3) + c - '0';
+		  }
+		  break;
+	case 'c':
+		  /* \cX: control character; a backslash must be doubled. */
+		  c = pgetc();
+		  if (c < 0x3f || c > 0x7a || c == 0x60)
+			  synerror("Bad escape sequence");
+		  if (c == '\\' && pgetc() != '\\')
+			  synerror("Bad escape sequence");
+		  v = (c == '?') ? 127 : (c & 0x1f);
+		  break;
+	case 'u':
+	case 'U':
+		  n = c == 'U' ? 8 : 4;
+		  v = 0;
+		  for (i = 0; i < n; i++) {
+			  c = pgetc();
+			  if (c >= '0' && c <= '9')
+				  v = (v << 4) + c - '0';
+			  else if (c >= 'A' && c <= 'F')
+				  v = (v << 4) + c - 'A' + 10;
+			  else if (c >= 'a' && c <= 'f')
+				  v = (v << 4) + c - 'a' + 10;
+			  else
+				  synerror("Bad escape sequence");
+		  }
+		  if (v == 0 || (v >= 0xd800 && v <= 0xdfff))
+			  synerror("Bad escape sequence");
+		  /* We really need iconv here. */
+		  if (v > 127)
+			  v = '?';
+		  break;
+	default:
+		  synerror("Bad escape sequence");
+	}
+	vc = (char)v;
+	/*
+	 * We can't handle NUL bytes.
+	 * POSIX says we should skip till the closing quote.
+	 */
+	if (vc == '\0') {
+		while ((c = pgetc()) != '\'') {
+			if (c == '\\')
+				c = pgetc();
+			if (c == PEOF)
+				synerror("Unterminated quoted string");
+			if (c == '\n') {
+				/* Skipped newlines still count as lines. */
+				plinno++;
+				setprompt(doprompt ? 2 : 0);
+			}
+		}
+		pungetc();
+		return out;
+	}
+	if (SQSYNTAX[vc] == CCTL)
+		USTPUTC(CTLESC, out);
+	USTPUTC(vc, out);
+	return out;
+}
+
+
+/*
  * If eofmark is NULL, read a word or a redirection symbol.  If eofmark
  * is not NULL, read a here document.  In the latter case, eofmark is the
  * word which marks the end of the document and striptabs is true if
@@ -1158,6 +1279,7 @@ readtoken1(int firstc, char const *initialsyntax, char *eofmark, int striptabs)
 	struct tokenstate state_static[MAXNEST_static];
 	int maxnest = MAXNEST_static;
 	struct tokenstate *state = state_static;
+	int sqiscstyle = 0;
 
 	startlinno = plinno;
 	quotef = 0;
@@ -1188,6 +1310,12 @@ readtoken1(int firstc, char const *initialsyntax, char *eofmark, int striptabs)
 					setprompt(0);
 				c = pgetc();
 				goto loop;		/* continue outer loop */
+			case CSBACK:
+				if (sqiscstyle) {
+					out = readcstyleesc(out);
+					break;
+				}
+				/* FALLTHROUGH */
 			case CWORD:
 				USTPUTC(c, out);
 				break;
@@ -1232,6 +1360,7 @@ readtoken1(int firstc, char const *initialsyntax, char *eofmark, int striptabs)
 			case CSQUOTE:
 				USTPUTC(CTLQUOTEMARK, out);
 				state[level].syntax = SQSYNTAX;
+				sqiscstyle = 0;
 				break;
 			case CDQUOTE:
 				USTPUTC(CTLQUOTEMARK, out);
@@ -1450,11 +1579,7 @@ parsesub: {
 	int c1;
 
 	c = pgetc();
-	if (c != '(' && c != '{' && (is_eof(c) || !is_name(c)) &&
-	    !is_special(c)) {
-		USTPUTC('$', out);
-		pungetc();
-	} else if (c == '(') {	/* $(command) or $((arith)) */
+	if (c == '(') {	/* $(command) or $((arith)) */
 		if (pgetc() == '(') {
 			PARSEARITH();
 		} else {
@@ -1465,7 +1590,7 @@ parsesub: {
 			    state[level].syntax == DQSYNTAX ||
 			    state[level].syntax == ARISYNTAX);
 		}
-	} else {
+	} else if (c == '{' || is_name(c) || is_special(c)) {
 		USTPUTC(CTLVAR, out);
 		typeloc = out - stackblock();
 		USTPUTC(VSNORMAL, out);
@@ -1612,6 +1737,14 @@ varname:
 				newvarnest++;
 			}
 		}
+	} else if (c == '\'' && state[level].syntax == BASESYNTAX) {
+		/* $'cstylequotes' */
+		USTPUTC(CTLQUOTEMARK, out);
+		state[level].syntax = SQSYNTAX;
+		sqiscstyle = 1;
+	} else {
+		USTPUTC('$', out);
+		pungetc();
 	}
 	goto parsesub_return;
 }
diff --git a/bin/sh/sh.1 b/bin/sh/sh.1
index e9f34fb..e240d7d 100644
--- a/bin/sh/sh.1
+++ b/bin/sh/sh.1
@@ -32,7 +32,7 @@
 .\"	from: @(#)sh.1	8.6 (Berkeley) 5/4/95
 .\" $FreeBSD$
 .\"
-.Dd March 20, 2011
+.Dd May 5, 2011
 .Dt SH 1
 .Os
 .Sh NAME
@@ -396,13 +396,82 @@ Quoting is used to remove the special meaning of certain characters
 or words to the shell, such as operators, whitespace, keywords,
 or alias names.
 .Pp
-There are three types of quoting: matched single quotes,
+There are four types of quoting: matched single quotes,
+dollar-single quotes,
 matched double quotes, and backslash.
 .Bl -tag -width indent
 .It Single Quotes
 Enclosing characters in single quotes preserves the literal
 meaning of all the characters (except single quotes, making
 it impossible to put single-quotes in a single-quoted string).
+.It Dollar-Single Quotes
+Enclosing characters between
+.Li $'
+and
+.Li '
+preserves the literal meaning of all characters
+except backslashes and single quotes.
+A backslash introduces a C-style escape sequence:
+.Bl -tag -width xUnnnnnnnn
+.It \ea
+Alert (ring the terminal bell)
+.It \eb
+Backspace
+.It \ec Ns Ar c
+The control character denoted by
+.Li ^ Ns Ar c
+in
+.Xr stty 1 .
+If
+.Ar c
+is a backslash, it must be doubled.
+.It \ee
+The ESC character
+.Tn ( ASCII
+0x1b)
+.It \ef
+Formfeed
+.It \en
+Newline
+.It \er
+Carriage return
+.It \et
+Horizontal tab
+.It \ev
+Vertical tab
+.It \e\e
+Literal backslash
+.It \e\&'
+Literal single-quote
+.It \e\&"
+Literal double-quote
+.It \e Ns Ar nnn
+The byte whose octal value is
+.Ar nnn
+(one to three digits)
+.It \ex Ns Ar nn
+The byte whose hexadecimal value is
+.Ar nn
+(one or more digits, only the last two of which are used)
+.It \eu Ns Ar nnnn
+The Unicode code point
+.Ar nnnn
+(four hexadecimal digits)
+.It \eU Ns Ar nnnnnnnn
+The Unicode code point
+.Ar nnnnnnnn
+(eight hexadecimal digits)
+.El
+.Pp
+The sequences for Unicode code points currently only provide useful results
+for values below 128.
+They reject code point 0 and UTF-16 surrogates.
+.Pp
+If an escape sequence would produce a byte with value 0,
+that byte and the rest of the string until the matching single-quote
+are ignored.
+.Pp
+Any other string starting with a backslash is an error.
 .It Double Quotes
 Enclosing characters within double quotes preserves the literal
 meaning of all characters except dollar sign
author	jilles <jilles@FreeBSD.org>	2011-05-05 20:55:55 +0000
committer	jilles <jilles@FreeBSD.org>	2011-05-05 20:55:55 +0000
commit	5a49f52603051288db09536911ad4c73579aeb99 (patch)
tree	f65bdb50369c1f6631b70c1264abbc69f9ce4513 /bin
parent	7ec44d66a6202d6f9bf5a7ef04ba108896f96bea (diff)
download	FreeBSD-src-5a49f52603051288db09536911ad4c73579aeb99.zip FreeBSD-src-5a49f52603051288db09536911ad4c73579aeb99.tar.gz