libidn  1.43
idna.c
Go to the documentation of this file.
1 /* idna.c --- Prototypes for Internationalized Domain Name library.
2  Copyright (C) 2002-2025 Simon Josefsson
3 
4  This file is part of GNU Libidn.
5 
6  GNU Libidn is free software: you can redistribute it and/or
7  modify it under the terms of either:
8 
9  * the GNU Lesser General Public License as published by the Free
10  Software Foundation; either version 3 of the License, or (at
11  your option) any later version.
12 
13  or
14 
15  * the GNU General Public License as published by the Free
16  Software Foundation; either version 2 of the License, or (at
17  your option) any later version.
18 
19  or both in parallel, as here.
20 
21  GNU Libidn is distributed in the hope that it will be useful,
22  but WITHOUT ANY WARRANTY; without even the implied warranty of
23  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
24  General Public License for more details.
25 
26  You should have received copies of the GNU General Public License and
27  the GNU Lesser General Public License along with this program. If
28  not, see <https://www.gnu.org/licenses/>. */
29 
30 #ifdef HAVE_CONFIG_H
31 # include "config.h"
32 #endif
33 
34 #include <stdlib.h>
35 #include <string.h>
36 #include <stringprep.h>
37 #include <punycode.h>
38 
39 #include "idna.h"
40 
41 /* Get c_strcasecmp. */
42 #include <c-strcase.h>
43 
/* Nonzero when code point C is one of the four characters accepted as
   a label separator: U+002E (full stop), U+3002 (ideographic full
   stop), U+FF0E (fullwidth full stop) or U+FF61 (halfwidth
   ideographic full stop).  C is evaluated more than once, so pass an
   expression without side effects. */
#define DOTP(c)							\
  ((c) == 0x002E || (c) == 0x3002 || (c) == 0xFF0E || (c) == 0xFF61)
46 
47 /* Core functions */
48 
80 int
81 idna_to_ascii_4i (const uint32_t *in, size_t inlen, char *out, int flags)
82 {
83  size_t len, outlen;
84  uint32_t *src; /* XXX don't need to copy data? */
85  int rc;
86 
87  /*
88  * ToASCII consists of the following steps:
89  *
90  * 1. If all code points in the sequence are in the ASCII range (0..7F)
91  * then skip to step 3.
92  */
93 
94  {
95  size_t i;
96  int inasciirange;
97 
98  inasciirange = 1;
99  for (i = 0; i < inlen; i++)
100  if (in[i] > 0x7F)
101  inasciirange = 0;
102  if (inasciirange)
103  {
104  src = malloc (sizeof (in[0]) * (inlen + 1));
105  if (src == NULL)
106  return IDNA_MALLOC_ERROR;
107 
108  memcpy (src, in, sizeof (in[0]) * inlen);
109  src[inlen] = 0;
110 
111  goto step3;
112  }
113  }
114 
115  /*
116  * 2. Perform the steps specified in [NAMEPREP] and fail if there is
117  * an error. The AllowUnassigned flag is used in [NAMEPREP].
118  */
119 
120  {
121  char *p;
122 
123  p = stringprep_ucs4_to_utf8 (in, (ssize_t) inlen, NULL, NULL);
124  if (p == NULL)
125  return IDNA_MALLOC_ERROR;
126 
127  len = strlen (p);
128  do
129  {
130  char *newp;
131 
132  len = 2 * len + 10; /* XXX better guess? */
133  newp = realloc (p, len);
134  if (newp == NULL)
135  {
136  free (p);
137  return IDNA_MALLOC_ERROR;
138  }
139  p = newp;
140 
141  if (flags & IDNA_ALLOW_UNASSIGNED)
142  rc = stringprep_nameprep (p, len);
143  else
144  rc = stringprep_nameprep_no_unassigned (p, len);
145  }
146  while (rc == STRINGPREP_TOO_SMALL_BUFFER);
147 
148  if (rc != STRINGPREP_OK)
149  {
150  free (p);
151  return IDNA_STRINGPREP_ERROR;
152  }
153 
154  src = stringprep_utf8_to_ucs4 (p, -1, NULL);
155 
156  free (p);
157 
158  if (!src)
159  return IDNA_MALLOC_ERROR;
160  }
161 
162 step3:
163  /*
164  * 3. If the UseSTD3ASCIIRules flag is set, then perform these checks:
165  *
166  * (a) Verify the absence of non-LDH ASCII code points; that is,
167  * the absence of 0..2C, 2E..2F, 3A..40, 5B..60, and 7B..7F.
168  *
169  * (b) Verify the absence of leading and trailing hyphen-minus;
170  * that is, the absence of U+002D at the beginning and end of
171  * the sequence.
172  */
173 
174  if (flags & IDNA_USE_STD3_ASCII_RULES)
175  {
176  size_t i;
177 
178  for (i = 0; src[i]; i++)
179  if (src[i] <= 0x2C || src[i] == 0x2E || src[i] == 0x2F ||
180  (src[i] >= 0x3A && src[i] <= 0x40) ||
181  (src[i] >= 0x5B && src[i] <= 0x60) ||
182  (src[i] >= 0x7B && src[i] <= 0x7F))
183  {
184  free (src);
185  return IDNA_CONTAINS_NON_LDH;
186  }
187 
188  if (src[0] == 0x002D || (i > 0 && src[i - 1] == 0x002D))
189  {
190  free (src);
191  return IDNA_CONTAINS_MINUS;
192  }
193  }
194 
195  /*
196  * 4. If all code points in the sequence are in the ASCII range
197  * (0..7F), then skip to step 8.
198  */
199 
200  {
201  size_t i;
202  int inasciirange;
203 
204  inasciirange = 1;
205  for (i = 0; src[i]; i++)
206  {
207  if (src[i] > 0x7F)
208  inasciirange = 0;
209  /* copy string to output buffer if we are about to skip to step8 */
210  if (i < 64)
211  out[i] = src[i];
212  }
213  if (i < 64)
214  out[i] = '\0';
215  else
216  {
217  free (src);
218  return IDNA_INVALID_LENGTH;
219  }
220  if (inasciirange)
221  goto step8;
222  }
223 
224  /*
225  * 5. Verify that the sequence does NOT begin with the ACE prefix.
226  *
227  */
228 
229  {
230  size_t i;
231  int match;
232 
233  match = 1;
234  for (i = 0; match && i < strlen (IDNA_ACE_PREFIX); i++)
235  if (((uint32_t) IDNA_ACE_PREFIX[i] & 0xFF) != src[i])
236  match = 0;
237  if (match)
238  {
239  free (src);
241  }
242  }
243 
244  /*
245  * 6. Encode the sequence using the encoding algorithm in [PUNYCODE]
246  * and fail if there is an error.
247  */
248  for (len = 0; src[len]; len++)
249  ;
250  src[len] = '\0';
251  outlen = 63 - strlen (IDNA_ACE_PREFIX);
252  rc = punycode_encode (len, src, NULL,
253  &outlen, &out[strlen (IDNA_ACE_PREFIX)]);
254  if (rc != PUNYCODE_SUCCESS)
255  {
256  free (src);
257  return IDNA_PUNYCODE_ERROR;
258  }
259  out[strlen (IDNA_ACE_PREFIX) + outlen] = '\0';
260 
261  /*
262  * 7. Prepend the ACE prefix.
263  */
264 
265  memcpy (out, IDNA_ACE_PREFIX, strlen (IDNA_ACE_PREFIX));
266 
267  /*
268  * 8. Verify that the number of code points is in the range 1 to 63
269  * inclusive (0 is excluded).
270  */
271 
272 step8:
273  free (src);
274  if (strlen (out) < 1)
275  return IDNA_INVALID_LENGTH;
276 
277  return IDNA_SUCCESS;
278 }
279 
280 /* ToUnicode(). May realloc() utf8in. Will free utf8in unconditionally. */
281 static int
282 idna_to_unicode_internal (char *utf8in,
283  uint32_t *out, size_t *outlen, int flags)
284 {
285  int rc;
286  char tmpout[64];
287  size_t utf8len = strlen (utf8in) + 1;
288  size_t addlen = 0, addinc = utf8len / 10 + 1;
289 
290  /*
291  * ToUnicode consists of the following steps:
292  *
293  * 1. If the sequence contains any code points outside the ASCII range
294  * (0..7F) then proceed to step 2, otherwise skip to step 3.
295  */
296 
297  {
298  size_t i;
299  int inasciirange;
300 
301  inasciirange = 1;
302  for (i = 0; utf8in[i]; i++)
303  if (utf8in[i] & ~0x7F)
304  inasciirange = 0;
305  if (inasciirange)
306  goto step3;
307  }
308 
309  /*
310  * 2. Perform the steps specified in [NAMEPREP] and fail if there is an
311  * error. (If step 3 of ToASCII is also performed here, it will not
312  * affect the overall behavior of ToUnicode, but it is not
313  * necessary.) The AllowUnassigned flag is used in [NAMEPREP].
314  */
315  do
316  {
317  char *newp = realloc (utf8in, utf8len + addlen);
318  if (newp == NULL)
319  {
320  free (utf8in);
321  return IDNA_MALLOC_ERROR;
322  }
323  utf8in = newp;
324  if (flags & IDNA_ALLOW_UNASSIGNED)
325  rc = stringprep_nameprep (utf8in, utf8len + addlen);
326  else
327  rc = stringprep_nameprep_no_unassigned (utf8in, utf8len + addlen);
328  addlen += addinc;
329  addinc *= 2;
330  }
331  while (rc == STRINGPREP_TOO_SMALL_BUFFER);
332 
333  if (rc != STRINGPREP_OK)
334  {
335  free (utf8in);
336  return IDNA_STRINGPREP_ERROR;
337  }
338 
339  /* 3. Verify that the sequence begins with the ACE prefix, and save a
340  * copy of the sequence.
341  * ... The ToASCII and ToUnicode operations MUST recognize the ACE
342  prefix in a case-insensitive manner.
343  */
344 
345 step3:
346  if (c_strncasecmp (utf8in, IDNA_ACE_PREFIX, strlen (IDNA_ACE_PREFIX)) != 0)
347  {
348  free (utf8in);
349  return IDNA_NO_ACE_PREFIX;
350  }
351 
352  /* 4. Remove the ACE prefix.
353  */
354 
355  memmove (utf8in, &utf8in[strlen (IDNA_ACE_PREFIX)],
356  strlen (utf8in) - strlen (IDNA_ACE_PREFIX) + 1);
357 
358  /* 5. Decode the sequence using the decoding algorithm in [PUNYCODE]
359  * and fail if there is an error. Save a copy of the result of
360  * this step.
361  */
362 
363  (*outlen)--; /* reserve one for the zero */
364 
365  rc = punycode_decode (strlen (utf8in), utf8in, outlen, out, NULL);
366  if (rc != PUNYCODE_SUCCESS)
367  {
368  free (utf8in);
369  return IDNA_PUNYCODE_ERROR;
370  }
371 
372  out[*outlen] = 0; /* add zero */
373 
374  /* 6. Apply ToASCII.
375  */
376 
377  rc = idna_to_ascii_4i (out, *outlen, tmpout, flags);
378  if (rc != IDNA_SUCCESS)
379  {
380  free (utf8in);
381  return rc;
382  }
383 
384  /* 7. Verify that the result of step 6 matches the saved copy from
385  * step 3, using a case-insensitive ASCII comparison.
386  */
387 
388  if (c_strcasecmp (utf8in, tmpout + strlen (IDNA_ACE_PREFIX)) != 0)
389  {
390  free (utf8in);
392  }
393 
394  /* 8. Return the saved copy from step 5.
395  */
396 
397  free (utf8in);
398  return IDNA_SUCCESS;
399 }
400 
436 int
437 idna_to_unicode_44i (const uint32_t *in, size_t inlen,
438  uint32_t *out, size_t *outlen, int flags)
439 {
440  int rc;
441  size_t outlensave = *outlen;
442  char *p;
443 
444  p = stringprep_ucs4_to_utf8 (in, (ssize_t) inlen, NULL, NULL);
445  if (p == NULL)
446  return IDNA_MALLOC_ERROR;
447 
448  rc = idna_to_unicode_internal (p, out, outlen, flags);
449  if (rc != IDNA_SUCCESS)
450  {
451  memcpy (out, in, sizeof (in[0]) * (inlen < outlensave ?
452  inlen : outlensave));
453  *outlen = inlen;
454  }
455 
456  /* p is freed in idna_to_unicode_internal. */
457 
458  return rc;
459 }
460 
461 /* Wrappers that handle several labels */
462 
476 int
477 idna_to_ascii_4z (const uint32_t *input, char **output, int flags)
478 {
479  const uint32_t *start = input;
480  const uint32_t *end;
481  char buf[64];
482  char *out = NULL;
483  int rc;
484 
485  /* 1) Whenever dots are used as label separators, the following
486  characters MUST be recognized as dots: U+002E (full stop),
487  U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
488  U+FF61 (halfwidth ideographic full stop). */
489 
490  if (input[0] == 0)
491  {
492  /* Handle implicit zero-length root label. */
493  *output = malloc (1);
494  if (!*output)
495  return IDNA_MALLOC_ERROR;
496  strcpy (*output, "");
497  return IDNA_SUCCESS;
498  }
499 
500  if (DOTP (input[0]) && input[1] == 0)
501  {
502  /* Handle explicit zero-length root label. */
503  *output = malloc (2);
504  if (!*output)
505  return IDNA_MALLOC_ERROR;
506  strcpy (*output, ".");
507  return IDNA_SUCCESS;
508  }
509 
510  *output = NULL;
511  do
512  {
513  end = start;
514 
515  for (; *end && !DOTP (*end); end++)
516  ;
517 
518  if (*end == '\0' && start == end)
519  {
520  /* Handle explicit zero-length root label. */
521  buf[0] = '\0';
522  }
523  else
524  {
525  rc = idna_to_ascii_4i (start, (size_t) (end - start), buf, flags);
526  if (rc != IDNA_SUCCESS)
527  {
528  free (out);
529  return rc;
530  }
531  }
532 
533  if (out)
534  {
535  size_t l = strlen (out) + 1 + strlen (buf) + 1;
536  char *newp = realloc (out, l);
537  if (!newp)
538  {
539  free (out);
540  return IDNA_MALLOC_ERROR;
541  }
542  out = newp;
543  strcat (out, ".");
544  strcat (out, buf);
545  }
546  else
547  {
548  out = strdup (buf);
549  if (!out)
550  return IDNA_MALLOC_ERROR;
551  }
552 
553  start = end + 1;
554  }
555  while (*end);
556 
557  *output = out;
558 
559  return IDNA_SUCCESS;
560 }
561 
575 int
576 idna_to_ascii_8z (const char *input, char **output, int flags)
577 {
578  uint32_t *ucs4;
579  size_t ucs4len;
580  int rc;
581 
582  ucs4 = stringprep_utf8_to_ucs4 (input, -1, &ucs4len);
583  if (!ucs4)
584  return IDNA_ICONV_ERROR;
585 
586  rc = idna_to_ascii_4z (ucs4, output, flags);
587 
588  free (ucs4);
589 
590  return rc;
591 
592 }
593 
608 int
609 idna_to_ascii_lz (const char *input, char **output, int flags)
610 {
611  char *utf8;
612  int rc;
613 
614  utf8 = stringprep_locale_to_utf8 (input);
615  if (!utf8)
616  return IDNA_ICONV_ERROR;
617 
618  rc = idna_to_ascii_8z (utf8, output, flags);
619 
620  free (utf8);
621 
622  return rc;
623 }
624 
639 int
640 idna_to_unicode_4z4z (const uint32_t *input, uint32_t **output, int flags)
641 {
642  const uint32_t *start = input;
643  const uint32_t *end;
644  uint32_t *buf;
645  size_t buflen;
646  uint32_t *out = NULL;
647  size_t outlen = 0;
648  int rc;
649 
650  *output = NULL;
651 
652  do
653  {
654  end = start;
655 
656  for (; *end && !DOTP (*end); end++)
657  ;
658 
659  buflen = (size_t) (end - start);
660  buf = malloc (sizeof (buf[0]) * (buflen + 1));
661  if (!buf)
662  {
663  free (out);
664  return IDNA_MALLOC_ERROR;
665  }
666 
667  /* don't check for non-malloc return codes as per
668  specification! */
669  rc = idna_to_unicode_44i (start, (size_t) (end - start),
670  buf, &buflen, flags);
671  if (rc == IDNA_MALLOC_ERROR)
672  {
673  free (out);
674  return IDNA_MALLOC_ERROR;
675  }
676 
677  if (out)
678  {
679  uint32_t *newp = realloc (out,
680  sizeof (out[0])
681  * (outlen + 1 + buflen + 1));
682  if (!newp)
683  {
684  free (buf);
685  free (out);
686  return IDNA_MALLOC_ERROR;
687  }
688  out = newp;
689  out[outlen++] = 0x002E; /* '.' (full stop) */
690  memcpy (out + outlen, buf, sizeof (buf[0]) * buflen);
691  outlen += buflen;
692  out[outlen] = 0x0;
693  free (buf);
694  }
695  else
696  {
697  out = buf;
698  outlen = buflen;
699  out[outlen] = 0x0;
700  }
701 
702  start = end + 1;
703  }
704  while (*end);
705 
706  *output = out;
707 
708  return IDNA_SUCCESS;
709 }
710 
725 int
726 idna_to_unicode_8z4z (const char *input, uint32_t **output, int flags)
727 {
728  uint32_t *ucs4;
729  size_t ucs4len;
730  int rc;
731 
732  ucs4 = stringprep_utf8_to_ucs4 (input, -1, &ucs4len);
733  if (!ucs4)
734  return IDNA_ICONV_ERROR;
735 
736  rc = idna_to_unicode_4z4z (ucs4, output, flags);
737  free (ucs4);
738 
739  return rc;
740 }
741 
756 int
757 idna_to_unicode_8z8z (const char *input, char **output, int flags)
758 {
759  uint32_t *ucs4;
760  int rc;
761 
762  rc = idna_to_unicode_8z4z (input, &ucs4, flags);
763  if (rc != IDNA_SUCCESS)
764  return rc;
765 
766  *output = stringprep_ucs4_to_utf8 (ucs4, -1, NULL, NULL);
767  free (ucs4);
768 
769  if (!*output)
770  return IDNA_ICONV_ERROR;
771 
772  return IDNA_SUCCESS;
773 }
774 
790 int
791 idna_to_unicode_8zlz (const char *input, char **output, int flags)
792 {
793  char *utf8;
794  int rc;
795 
796  rc = idna_to_unicode_8z8z (input, &utf8, flags);
797  if (rc != IDNA_SUCCESS)
798  return rc;
799 
800  *output = stringprep_utf8_to_locale (utf8);
801  free (utf8);
802 
803  if (!*output)
804  return IDNA_ICONV_ERROR;
805 
806  return IDNA_SUCCESS;
807 }
808 
825 int
826 idna_to_unicode_lzlz (const char *input, char **output, int flags)
827 {
828  char *utf8;
829  int rc;
830 
831  utf8 = stringprep_locale_to_utf8 (input);
832  if (!utf8)
833  return IDNA_ICONV_ERROR;
834 
835  rc = idna_to_unicode_8zlz (utf8, output, flags);
836  free (utf8);
837 
838  return rc;
839 }
840 
int idna_to_unicode_8zlz(const char *input, char **output, int flags)
Definition: idna.c:791
#define DOTP(c)
Definition: idna.c:44
int idna_to_unicode_4z4z(const uint32_t *input, uint32_t **output, int flags)
Definition: idna.c:640
int idna_to_ascii_8z(const char *input, char **output, int flags)
Definition: idna.c:576
int idna_to_ascii_4z(const uint32_t *input, char **output, int flags)
Definition: idna.c:477
int idna_to_unicode_lzlz(const char *input, char **output, int flags)
Definition: idna.c:826
int idna_to_unicode_8z4z(const char *input, uint32_t **output, int flags)
Definition: idna.c:726
int idna_to_unicode_44i(const uint32_t *in, size_t inlen, uint32_t *out, size_t *outlen, int flags)
Definition: idna.c:437
int idna_to_unicode_8z8z(const char *input, char **output, int flags)
Definition: idna.c:757
int idna_to_ascii_4i(const uint32_t *in, size_t inlen, char *out, int flags)
Definition: idna.c:81
int idna_to_ascii_lz(const char *input, char **output, int flags)
Definition: idna.c:609
@ IDNA_ROUNDTRIP_VERIFY_ERROR
Definition: idna.h:83
@ IDNA_PUNYCODE_ERROR
Definition: idna.h:76
@ IDNA_SUCCESS
Definition: idna.h:74
@ IDNA_NO_ACE_PREFIX
Definition: idna.h:82
@ IDNA_CONTAINS_MINUS
Definition: idna.h:80
@ IDNA_ICONV_ERROR
Definition: idna.h:85
@ IDNA_STRINGPREP_ERROR
Definition: idna.h:75
@ IDNA_CONTAINS_ACE_PREFIX
Definition: idna.h:84
@ IDNA_CONTAINS_NON_LDH
Definition: idna.h:77
@ IDNA_INVALID_LENGTH
Definition: idna.h:81
@ IDNA_MALLOC_ERROR
Definition: idna.h:87
@ IDNA_USE_STD3_ASCII_RULES
Definition: idna.h:95
@ IDNA_ALLOW_UNASSIGNED
Definition: idna.h:94
#define IDNA_ACE_PREFIX
Definition: idna.h:99
char * stringprep_ucs4_to_utf8(const uint32_t *str, ssize_t len, size_t *items_read, size_t *items_written)
Definition: nfkc.c:1019
uint32_t * stringprep_utf8_to_ucs4(const char *str, ssize_t len, size_t *items_written)
Definition: nfkc.c:986
int punycode_decode(size_t input_length, const char input[], size_t *output_length, punycode_uint output[], unsigned char case_flags[])
Definition: punycode.c:348
int punycode_encode(size_t input_length, const punycode_uint input[], const unsigned char case_flags[], size_t *output_length, char output[])
Definition: punycode.c:196
@ PUNYCODE_SUCCESS
Definition: punycode.h:110
IDNAPI char * stringprep_locale_to_utf8(const char *str)
Definition: toutf8.c:145
@ STRINGPREP_TOO_SMALL_BUFFER
Definition: stringprep.h:75
@ STRINGPREP_OK
Definition: stringprep.h:67
#define stringprep_nameprep(in, maxlen)
Definition: stringprep.h:202
IDNAPI char * stringprep_utf8_to_locale(const char *str)
Definition: toutf8.c:161
#define stringprep_nameprep_no_unassigned(in, maxlen)
Definition: stringprep.h:205