Skip to content
7 changes: 7 additions & 0 deletions Doc/library/unicodedata.rst
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,13 @@ following functions:
a human reader, if one has combining characters and the other
doesn't, they may not compare equal.

.. function:: is_normalized(form, unistr)

Return whether the Unicode string *unistr* is in the normal form *form*. Valid
values for *form* are 'NFC', 'NFKC', 'NFD', and 'NFKD'.

.. versionadded:: 3.8


In addition, the module exposes the following constant:

Expand Down
7 changes: 7 additions & 0 deletions Doc/whatsnew/3.8.rst
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,13 @@ Added method :meth:`~tkinter.Canvas.moveto`
in the :class:`tkinter.Canvas` class.
(Contributed by Juliette Monsel in :issue:`23831`.)

unicodedata
-----------

* The new function :func:`~unicodedata.is_normalized` can be used to verify that
a string is in a specific normal form. (Contributed by Max Belanger and David
Euresti in :issue:`32285`.)

venv
----

Expand Down
11 changes: 10 additions & 1 deletion Lib/test/test_normalization.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from http.client import HTTPException
import sys
from unicodedata import normalize, unidata_version
from unicodedata import normalize, is_normalized, unidata_version

TESTDATAFILE = "NormalizationTest.txt"
TESTDATAURL = "http://www.pythontest.net/unicode/" + unidata_version + "/" + TESTDATAFILE
Expand Down Expand Up @@ -88,6 +88,15 @@ def run_normalization_tests(self, testdata):
NFKD(c3) == NFKD(c4) == NFKD(c5),
line)

self.assertTrue(is_normalized("NFC", c2))
self.assertTrue(is_normalized("NFC", c4))

self.assertTrue(is_normalized("NFD", c3))
self.assertTrue(is_normalized("NFD", c5))

self.assertTrue(is_normalized("NFKC", c4))
self.assertTrue(is_normalized("NFKD", c5))
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There should be some negative cases, too. Make sure the MAYBE case is being exercised.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Increased coverage + confirmed that this is exercising the MAYBE path.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe also add tests for cases where it returns False. If the function always returned True, the tests would still pass ;-)


# Record part 1 data
if part == "@Part1":
part1_data[c1] = 1
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
New function unicodedata.is_normalized, which can check whether a string is
in a specific normal form.
40 changes: 36 additions & 4 deletions Modules/clinic/unicodedata.c.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

115 changes: 98 additions & 17 deletions Modules/unicodedata.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@
#include "ucnhash.h"
#include "structmember.h"

_Py_IDENTIFIER(NFC);
_Py_IDENTIFIER(NFD);
_Py_IDENTIFIER(NFKC);
_Py_IDENTIFIER(NFKD);

/*[clinic input]
module unicodedata
class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type'
Expand Down Expand Up @@ -770,8 +775,10 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
return result;
}

/* Return 1 if the input is certainly normalized, 0 if it might not be. */
static int
typedef enum {YES, NO, MAYBE} NormalMode;

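/* Return YES if the input is certainly normalized, MAYBE if a quickcheck bit
   is set and the string might need normalization, and NO if combining
   characters are not in canonical sort order.  NO is also returned when an
   older version of the database is requested, since quickchecks are then
   disabled. */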
static NormalMode
is_normalized(PyObject *self, PyObject *input, int nfc, int k)
{
Py_ssize_t i, len;
Expand All @@ -782,7 +789,7 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k)
/* An older version of the database is requested, quickchecks must be
disabled. */
if (self && UCD_Check(self))
return 0;
return NO;

/* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
as described in http://unicode.org/reports/tr15/#Annex8. */
Expand All @@ -799,19 +806,92 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k)
unsigned char quickcheck = record->normalization_quick_check;

if (quickcheck & quickcheck_mask)
return 0; /* this string might need normalization */
return MAYBE; /* this string might need normalization */
if (combining && prev_combining > combining)
return 0; /* non-canonical sort order, not normalized */
return NO; /* non-canonical sort order, not normalized */
prev_combining = combining;
}
return 1; /* certainly normalized */
return YES; /* certainly normalized */
}

/*[clinic input]
unicodedata.UCD.is_normalized

self: self
form: unicode
unistr as input: unicode
/

Return whether the Unicode string unistr is in the normal form 'form'.

Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
[clinic start generated code]*/

static PyObject *
unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
PyObject *input)
/*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/
{
/* Report whether `input` is already in the normalization form named by
   `form`.  Returns a new reference to Py_True or Py_False, or NULL on
   failure (string not ready, unknown form, or normalization failed). */
if (PyUnicode_READY(input) == -1) {
return NULL;
}

if (PyUnicode_GET_LENGTH(input) == 0) {
/* special case empty input strings. */
Py_RETURN_TRUE;
}

PyObject *result;
/* Flags forwarded to is_normalized(): nfc is set for 'NFC' and 'NFKC',
   k is set for 'NFKC' and 'NFKD'. */
int nfc = 0;
int k = 0;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These could be bool.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is meant to conform to the existing implementation of is_normalized, which takes in ints. Could change is_normalized, but I preferred to avoid making changes outside the scope of my own.

NormalMode m;

PyObject *cmp;
int match = 0;

/* Translate the form name into the (nfc, k) flag pair; any other name is
   rejected with ValueError. */
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
nfc = 1;
}
else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
nfc = 1;
k = 1;
}
else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
/* matches default values for `nfc` and `k` */
}
else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
k = 1;
}
else {
PyErr_SetString(PyExc_ValueError, "invalid normalization form");
return NULL;
}

/* Run the quick check first; only a MAYBE answer needs the full
   normalization below. */
m = is_normalized(self, input, nfc, k);

if (m == MAYBE) {
/* Inconclusive quick check: normalize the input (nfc_nfkc when nfc is
   set, nfd_nfkd otherwise) and compare the result with the input. */
cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k);
if (cmp == NULL) {
return NULL;
}
match = PyUnicode_Compare(input, cmp);
/* The normalized copy is only needed for the comparison. */
Py_DECREF(cmp);
result = (match == 0) ? Py_True : Py_False;
}
else {
/* Definite answer from the quick check: YES maps to True, NO to False. */
result = (m == YES) ? Py_True : Py_False;
}

/* Take a new reference to the result before handing it to the caller. */
Py_INCREF(result);
return result;
}


/*[clinic input]
unicodedata.UCD.normalize

self: self
form: str
form: unicode
unistr as input: unicode
/

Expand All @@ -821,9 +901,9 @@ Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
[clinic start generated code]*/

static PyObject *
unicodedata_UCD_normalize_impl(PyObject *self, const char *form,
unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
PyObject *input)
/*[clinic end generated code: output=62d1f8870027efdc input=1744c55f4ab79bf0]*/
/*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb]*/
{
if (PyUnicode_GET_LENGTH(input) == 0) {
/* Special case empty input strings, since resizing
Expand All @@ -832,29 +912,29 @@ unicodedata_UCD_normalize_impl(PyObject *self, const char *form,
return input;
}

if (strcmp(form, "NFC") == 0) {
if (is_normalized(self, input, 1, 0)) {
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
if (is_normalized(self, input, 1, 0) == YES) {
Py_INCREF(input);
return input;
}
return nfc_nfkc(self, input, 0);
}
if (strcmp(form, "NFKC") == 0) {
if (is_normalized(self, input, 1, 1)) {
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
if (is_normalized(self, input, 1, 1) == YES) {
Py_INCREF(input);
return input;
}
return nfc_nfkc(self, input, 1);
}
if (strcmp(form, "NFD") == 0) {
if (is_normalized(self, input, 0, 0)) {
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
if (is_normalized(self, input, 0, 0) == YES) {
Py_INCREF(input);
return input;
}
return nfd_nfkd(self, input, 0);
}
if (strcmp(form, "NFKD") == 0) {
if (is_normalized(self, input, 0, 1)) {
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
if (is_normalized(self, input, 0, 1) == YES) {
Py_INCREF(input);
return input;
}
Expand Down Expand Up @@ -1271,6 +1351,7 @@ static PyMethodDef unicodedata_functions[] = {
UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
UNICODEDATA_UCD_NAME_METHODDEF
UNICODEDATA_UCD_LOOKUP_METHODDEF
UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
UNICODEDATA_UCD_NORMALIZE_METHODDEF
{NULL, NULL} /* sentinel */
};
Expand Down