-
-
Notifications
You must be signed in to change notification settings - Fork 34.5k
bpo-32285: Add unicodedata.is_normalized to check the current norma…
#4806
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
eea1543
7a4076c
591abc0
1ecc284
8db1a3c
fb401d5
cf2a177
15be04f
697e35b
25f0623
bd823e5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,2 @@ | ||
| New function unicodedata.is_normalized, which can check whether a string is | ||
| in a specific normal form. |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,6 +19,11 @@ | |
| #include "ucnhash.h" | ||
| #include "structmember.h" | ||
|
|
||
| _Py_IDENTIFIER(NFC); | ||
| _Py_IDENTIFIER(NFD); | ||
| _Py_IDENTIFIER(NFKC); | ||
| _Py_IDENTIFIER(NFKD); | ||
|
|
||
| /*[clinic input] | ||
| module unicodedata | ||
| class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type' | ||
|
|
@@ -770,8 +775,10 @@ nfc_nfkc(PyObject *self, PyObject *input, int k) | |
| return result; | ||
| } | ||
|
|
||
| /* Return 1 if the input is certainly normalized, 0 if it might not be. */ | ||
| static int | ||
| typedef enum {YES, NO, MAYBE} NormalMode; | ||
|
|
||
| /* Return YES if the input is certainly normalized, NO or MAYBE if it might not be. */ | ||
| static NormalMode | ||
| is_normalized(PyObject *self, PyObject *input, int nfc, int k) | ||
| { | ||
| Py_ssize_t i, len; | ||
|
|
@@ -782,7 +789,7 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k) | |
| /* An older version of the database is requested, quickchecks must be | ||
| disabled. */ | ||
| if (self && UCD_Check(self)) | ||
| return 0; | ||
| return NO; | ||
|
|
||
| /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No, | ||
| as described in http://unicode.org/reports/tr15/#Annex8. */ | ||
|
|
@@ -799,19 +806,92 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k) | |
| unsigned char quickcheck = record->normalization_quick_check; | ||
|
|
||
| if (quickcheck & quickcheck_mask) | ||
| return 0; /* this string might need normalization */ | ||
| return MAYBE; /* this string might need normalization */ | ||
| if (combining && prev_combining > combining) | ||
| return 0; /* non-canonical sort order, not normalized */ | ||
| return NO; /* non-canonical sort order, not normalized */ | ||
| prev_combining = combining; | ||
| } | ||
| return 1; /* certainly normalized */ | ||
| return YES; /* certainly normalized */ | ||
| } | ||
|
|
||
| /*[clinic input] | ||
| unicodedata.UCD.is_normalized | ||
|
|
||
| self: self | ||
| form: unicode | ||
| unistr as input: unicode | ||
| / | ||
|
|
||
| Return whether the Unicode string unistr is in the normal form 'form'. | ||
|
|
||
| Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'. | ||
| [clinic start generated code]*/ | ||
|
|
||
| static PyObject * | ||
| unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form, | ||
| PyObject *input) | ||
| /*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/ | ||
| { | ||
| if (PyUnicode_READY(input) == -1) { | ||
| return NULL; | ||
| } | ||
|
|
||
| if (PyUnicode_GET_LENGTH(input) == 0) { | ||
| /* special case empty input strings. */ | ||
| Py_RETURN_TRUE; | ||
| } | ||
|
|
||
| PyObject *result; | ||
| int nfc = 0; | ||
| int k = 0; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These could be
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is meant to conform to the existing implementation of |
||
| NormalMode m; | ||
|
|
||
| PyObject *cmp; | ||
| int match = 0; | ||
|
|
||
| if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) { | ||
| nfc = 1; | ||
| } | ||
| else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) { | ||
| nfc = 1; | ||
| k = 1; | ||
| } | ||
| else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) { | ||
| /* matches default values for `nfc` and `k` */ | ||
| } | ||
| else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) { | ||
| k = 1; | ||
| } | ||
| else { | ||
| PyErr_SetString(PyExc_ValueError, "invalid normalization form"); | ||
| return NULL; | ||
| } | ||
|
|
||
| m = is_normalized(self, input, nfc, k); | ||
|
|
||
| if (m == MAYBE) { | ||
| cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k); | ||
| if (cmp == NULL) { | ||
| return NULL; | ||
| } | ||
| match = PyUnicode_Compare(input, cmp); | ||
| Py_DECREF(cmp); | ||
| result = (match == 0) ? Py_True : Py_False; | ||
| } | ||
| else { | ||
| result = (m == YES) ? Py_True : Py_False; | ||
| } | ||
|
|
||
| Py_INCREF(result); | ||
| return result; | ||
| } | ||
|
|
||
|
|
||
| /*[clinic input] | ||
| unicodedata.UCD.normalize | ||
|
|
||
| self: self | ||
| form: str | ||
| form: unicode | ||
| unistr as input: unicode | ||
| / | ||
|
|
||
|
|
@@ -821,9 +901,9 @@ Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'. | |
| [clinic start generated code]*/ | ||
|
|
||
| static PyObject * | ||
| unicodedata_UCD_normalize_impl(PyObject *self, const char *form, | ||
| unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form, | ||
| PyObject *input) | ||
| /*[clinic end generated code: output=62d1f8870027efdc input=1744c55f4ab79bf0]*/ | ||
| /*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb]*/ | ||
| { | ||
| if (PyUnicode_GET_LENGTH(input) == 0) { | ||
| /* Special case empty input strings, since resizing | ||
|
|
@@ -832,29 +912,29 @@ unicodedata_UCD_normalize_impl(PyObject *self, const char *form, | |
| return input; | ||
| } | ||
|
|
||
| if (strcmp(form, "NFC") == 0) { | ||
| if (is_normalized(self, input, 1, 0)) { | ||
| if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) { | ||
| if (is_normalized(self, input, 1, 0) == YES) { | ||
| Py_INCREF(input); | ||
| return input; | ||
| } | ||
| return nfc_nfkc(self, input, 0); | ||
| } | ||
| if (strcmp(form, "NFKC") == 0) { | ||
| if (is_normalized(self, input, 1, 1)) { | ||
| if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) { | ||
| if (is_normalized(self, input, 1, 1) == YES) { | ||
| Py_INCREF(input); | ||
| return input; | ||
| } | ||
| return nfc_nfkc(self, input, 1); | ||
| } | ||
| if (strcmp(form, "NFD") == 0) { | ||
| if (is_normalized(self, input, 0, 0)) { | ||
| if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) { | ||
| if (is_normalized(self, input, 0, 0) == YES) { | ||
| Py_INCREF(input); | ||
| return input; | ||
| } | ||
| return nfd_nfkd(self, input, 0); | ||
| } | ||
| if (strcmp(form, "NFKD") == 0) { | ||
| if (is_normalized(self, input, 0, 1)) { | ||
| if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) { | ||
| if (is_normalized(self, input, 0, 1) == YES) { | ||
| Py_INCREF(input); | ||
| return input; | ||
| } | ||
|
|
@@ -1271,6 +1351,7 @@ static PyMethodDef unicodedata_functions[] = { | |
| UNICODEDATA_UCD_DECOMPOSITION_METHODDEF | ||
| UNICODEDATA_UCD_NAME_METHODDEF | ||
| UNICODEDATA_UCD_LOOKUP_METHODDEF | ||
| UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF | ||
| UNICODEDATA_UCD_NORMALIZE_METHODDEF | ||
| {NULL, NULL} /* sentinel */ | ||
| }; | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There should be some negative cases, too. Make sure the
MAYBEcase is being exercised.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Increased coverage + confirmed that this is exercising the
MAYBEpath.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe add also tests when it returns False. If the function always returns True, the test still pass ;-)