Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions Lib/test/test_grammar.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@

class TokenTests(unittest.TestCase):

check_syntax_error = check_syntax_error

def test_backslash(self):
# Backslash means line continuation:
x = 1 \
Expand Down Expand Up @@ -184,6 +186,28 @@ def test_underscore_literals(self):
# Sanity check: no literal begins with an underscore
self.assertRaises(NameError, eval, "_0")

def test_bad_numerical_literals(self):
check = self.check_syntax_error
check("0b12", "invalid digit '2' in binary literal")
check("0b1_2", "invalid digit '2' in binary literal")
check("0b2", "invalid digit '2' in binary literal")
check("0b1_", "invalid binary literal")
check("0b", "invalid binary literal")
check("0o18", "invalid digit '8' in octal literal")
check("0o1_8", "invalid digit '8' in octal literal")
check("0o8", "invalid digit '8' in octal literal")
check("0o1_", "invalid octal literal")
check("0o", "invalid octal literal")
check("0x1_", "invalid hexadecimal literal")
check("0x", "invalid hexadecimal literal")
check("1_", "invalid decimal literal")
check("012",
"leading zeros in decimal integer literals are not permitted; "
"use an 0o prefix for octal integers")
check("1.2_", "invalid decimal literal")
check("1e2_", "invalid decimal literal")
check("1e+", "invalid decimal literal")

def test_string_literals(self):
x = ''; y = ""; self.assertTrue(len(x) == 0 and x == y)
x = '\''; y = "'"; self.assertTrue(len(x) == 1 and x == y and ord(x) == 39)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Improved syntax error messages for invalid numerical literals.
65 changes: 52 additions & 13 deletions Parser/tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -1280,6 +1280,28 @@ PyToken_ThreeChars(int c1, int c2, int c3)
return OP;
}

/* Report a tokenizer syntax error and return ERRORTOKEN.
 *
 * `format` and the variadic arguments are forwarded to PyErr_FormatV() to
 * build a SyntaxError.  The error location is then set with
 * PyErr_SyntaxLocationObject() from tok->filename, tok->lineno and the
 * offset of tok->cur from tok->line_start, and tok->done is set to E_ERROR.
 *
 * When PGEN is defined no exception is raised: the format arguments are
 * ignored and tok->done is set to E_TOKEN instead.
 *
 * The return value is always ERRORTOKEN, so callers can write
 * `return syntaxerror(tok, "...");`.
 */
static int
syntaxerror(struct tok_state *tok, const char *format, ...)
{
#ifndef PGEN
    va_list vargs;
#ifdef HAVE_STDARG_PROTOTYPES
    va_start(vargs, format);
#else
    va_start(vargs);
#endif
    PyErr_FormatV(PyExc_SyntaxError, format, vargs);
    va_end(vargs);
    /* The offset is a pointer difference, while the column parameter is an
       int; narrow explicitly instead of relying on implicit conversion. */
    PyErr_SyntaxLocationObject(tok->filename,
                               tok->lineno,
                               (int)(tok->cur - tok->line_start));
    tok->done = E_ERROR;
#else
    tok->done = E_TOKEN;
#endif
    return ERRORTOKEN;
}

static int
indenterror(struct tok_state *tok)
{
Expand Down Expand Up @@ -1333,8 +1355,8 @@ tok_decimal_tail(struct tok_state *tok)
}
c = tok_nextc(tok);
if (!isdigit(c)) {
tok->done = E_TOKEN;
tok_backup(tok, c);
syntaxerror(tok, "invalid decimal literal");
return 0;
}
}
Expand Down Expand Up @@ -1562,9 +1584,8 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
c = tok_nextc(tok);
}
if (!isxdigit(c)) {
tok->done = E_TOKEN;
tok_backup(tok, c);
return ERRORTOKEN;
return syntaxerror(tok, "invalid hexadecimal literal");
}
do {
c = tok_nextc(tok);
Expand All @@ -1579,14 +1600,23 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
c = tok_nextc(tok);
}
if (c < '0' || c >= '8') {
tok->done = E_TOKEN;
tok_backup(tok, c);
return ERRORTOKEN;
if (isdigit(c)) {
return syntaxerror(tok,
"invalid digit '%c' in octal literal", c);
}
else {
return syntaxerror(tok, "invalid octal literal");
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it be possible to add the value of c to this message as well?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This error is raised when an underscore or 0o is not followed by a digit. What error messages would be helpful for 0o+2, 0o + 2, (2+0o), 0or[]?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know, I'd expect "invalid character '%c' in octal literal" to be useful in all cases.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is easy to report only when an invalid digit (in the range 2-9 or 8-9) occurs. In the general case there are many subtle details, and handling them would complicate the code too much:

  • An invalid character does not always exist. This error can be raised at the end of the input.
  • It can be non-ASCII. In this case we would need to decode a multibyte UTF-8 sequence to get the character.
  • It can be non-printable.
  • Even if it is printable from Unicode's point of view, it can look indistinguishable from other characters. For example, a non-breaking space character looks like an ordinary space to humans, but not to the Python parser.
  • Even in ASCII there are non-printable characters, or characters that need special handling: tab, newline, single quote, backslash, ...

It may be worth producing more specialized error messages for some cases, but just reporting the next invalid character is not the way to go.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, indeed, I didn't think about all these issues...

}
}
do {
c = tok_nextc(tok);
} while ('0' <= c && c < '8');
} while (c == '_');
if (isdigit(c)) {
return syntaxerror(tok,
"invalid digit '%c' in octal literal", c);
}
}
else if (c == 'b' || c == 'B') {
/* Binary */
Expand All @@ -1596,14 +1626,23 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
c = tok_nextc(tok);
}
if (c != '0' && c != '1') {
tok->done = E_TOKEN;
tok_backup(tok, c);
return ERRORTOKEN;
if (isdigit(c)) {
return syntaxerror(tok,
"invalid digit '%c' in binary literal", c);
}
else {
return syntaxerror(tok, "invalid binary literal");
}
}
do {
c = tok_nextc(tok);
} while (c == '0' || c == '1');
} while (c == '_');
if (isdigit(c)) {
return syntaxerror(tok,
"invalid digit '%c' in binary literal", c);
}
}
else {
int nonzero = 0;
Expand All @@ -1613,9 +1652,8 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
if (c == '_') {
c = tok_nextc(tok);
if (!isdigit(c)) {
tok->done = E_TOKEN;
tok_backup(tok, c);
return ERRORTOKEN;
return syntaxerror(tok, "invalid decimal literal");
}
}
if (c != '0') {
Expand All @@ -1642,9 +1680,11 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
}
else if (nonzero) {
/* Old-style octal: now disallowed. */
tok->done = E_TOKEN;
tok_backup(tok, c);
return ERRORTOKEN;
return syntaxerror(tok,
"leading zeros in decimal integer "
"literals are not permitted; "
"use an 0o prefix for octal integers");
}
}
}
Expand Down Expand Up @@ -1676,9 +1716,8 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
if (c == '+' || c == '-') {
c = tok_nextc(tok);
if (!isdigit(c)) {
tok->done = E_TOKEN;
tok_backup(tok, c);
return ERRORTOKEN;
return syntaxerror(tok, "invalid decimal literal");
}
} else if (!isdigit(c)) {
tok_backup(tok, c);
Expand Down