Blame - compat/regex/regcomp.c - jrn/git

blob: 8c96ed942c3f7139a4b161c75c6c81e42530c085 [file] [log] [blame]

Ævar Arnfjörð Bjarmason	d18f76d	2010-08-17 09:24:38 +0000	[diff] [blame]	1	/* Extended regular expression matching and search library.
				2	Copyright (C) 2002-2007,2009,2010 Free Software Foundation, Inc.
				3	This file is part of the GNU C Library.
				4	Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
				5
				6	The GNU C Library is free software; you can redistribute it and/or
				7	modify it under the terms of the GNU Lesser General Public
				8	License as published by the Free Software Foundation; either
				9	version 2.1 of the License, or (at your option) any later version.
				10
				11	The GNU C Library is distributed in the hope that it will be useful,
				12	but WITHOUT ANY WARRANTY; without even the implied warranty of
				13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				14	Lesser General Public License for more details.
				15
				16	You should have received a copy of the GNU Lesser General Public
				17	License along with the GNU C Library; if not, write to the Free
				18	Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
				19	02110-1301 USA. */
				20
				21	static reg_errcode_t re_compile_internal (regex_t preg, const char pattern,
				22	size_t length, reg_syntax_t syntax);
				23	static void re_compile_fastmap_iter (regex_t *bufp,
				24	const re_dfastate_t *init_state,
				25	char *fastmap);
				26	static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
				27	#ifdef RE_ENABLE_I18N
				28	static void free_charset (re_charset_t *cset);
				29	#endif /* RE_ENABLE_I18N */
				30	static void free_workarea_compile (regex_t *preg);
				31	static reg_errcode_t create_initial_state (re_dfa_t *dfa);
				32	#ifdef RE_ENABLE_I18N
				33	static void optimize_utf8 (re_dfa_t *dfa);
				34	#endif
				35	static reg_errcode_t analyze (regex_t *preg);
				36	static reg_errcode_t preorder (bin_tree_t *root,
				37	reg_errcode_t (fn (void , bin_tree_t )),
				38	void *extra);
				39	static reg_errcode_t postorder (bin_tree_t *root,
				40	reg_errcode_t (fn (void , bin_tree_t )),
				41	void *extra);
				42	static reg_errcode_t optimize_subexps (void extra, bin_tree_t node);
				43	static reg_errcode_t lower_subexps (void extra, bin_tree_t node);
				44	static bin_tree_t lower_subexp (reg_errcode_t err, regex_t *preg,
				45	bin_tree_t *node);
				46	static reg_errcode_t calc_first (void extra, bin_tree_t node);
				47	static reg_errcode_t calc_next (void extra, bin_tree_t node);
				48	static reg_errcode_t link_nfa_nodes (void extra, bin_tree_t node);
				49	static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
				50	static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
				51	unsigned int constraint);
				52	static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
				53	static reg_errcode_t calc_eclosure_iter (re_node_set new_set, re_dfa_t dfa,
				54	int node, int root);
				55	static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
				56	static int fetch_number (re_string_t input, re_token_t token,
				57	reg_syntax_t syntax);
				58	static int peek_token (re_token_t token, re_string_t input,
				59	reg_syntax_t syntax) internal_function;
				60	static bin_tree_t parse (re_string_t regexp, regex_t *preg,
				61	reg_syntax_t syntax, reg_errcode_t *err);
				62	static bin_tree_t parse_reg_exp (re_string_t regexp, regex_t *preg,
				63	re_token_t *token, reg_syntax_t syntax,
				64	int nest, reg_errcode_t *err);
				65	static bin_tree_t parse_branch (re_string_t regexp, regex_t *preg,
				66	re_token_t *token, reg_syntax_t syntax,
				67	int nest, reg_errcode_t *err);
				68	static bin_tree_t parse_expression (re_string_t regexp, regex_t *preg,
				69	re_token_t *token, reg_syntax_t syntax,
				70	int nest, reg_errcode_t *err);
				71	static bin_tree_t parse_sub_exp (re_string_t regexp, regex_t *preg,
				72	re_token_t *token, reg_syntax_t syntax,
				73	int nest, reg_errcode_t *err);
				74	static bin_tree_t parse_dup_op (bin_tree_t dup_elem, re_string_t *regexp,
				75	re_dfa_t dfa, re_token_t token,
				76	reg_syntax_t syntax, reg_errcode_t *err);
				77	static bin_tree_t parse_bracket_exp (re_string_t regexp, re_dfa_t *dfa,
				78	re_token_t *token, reg_syntax_t syntax,
				79	reg_errcode_t *err);
				80	static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
				81	re_string_t *regexp,
				82	re_token_t *token, int token_len,
				83	re_dfa_t *dfa,
				84	reg_syntax_t syntax,
				85	int accept_hyphen);
				86	static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
				87	re_string_t *regexp,
				88	re_token_t *token);
				89	#ifdef RE_ENABLE_I18N
				90	static reg_errcode_t build_equiv_class (bitset_t sbcset,
				91	re_charset_t *mbcset,
				92	int *equiv_class_alloc,
				93	const unsigned char *name);
				94	static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
				95	bitset_t sbcset,
				96	re_charset_t *mbcset,
				97	int *char_class_alloc,
				98	const char *class_name,
				99	reg_syntax_t syntax);
				100	#else /* not RE_ENABLE_I18N */
				101	static reg_errcode_t build_equiv_class (bitset_t sbcset,
				102	const unsigned char *name);
				103	static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
				104	bitset_t sbcset,
				105	const char *class_name,
				106	reg_syntax_t syntax);
				107	#endif /* not RE_ENABLE_I18N */
				108	static bin_tree_t build_charclass_op (re_dfa_t dfa,
				109	RE_TRANSLATE_TYPE trans,
				110	const char *class_name,
				111	const char *extra,
				112	int non_match, reg_errcode_t *err);
				113	static bin_tree_t create_tree (re_dfa_t dfa,
				114	bin_tree_t left, bin_tree_t right,
				115	re_token_type_t type);
				116	static bin_tree_t create_token_tree (re_dfa_t dfa,
				117	bin_tree_t left, bin_tree_t right,
				118	const re_token_t *token);
				119	static bin_tree_t duplicate_tree (const bin_tree_t src, re_dfa_t *dfa);
				120	static void free_token (re_token_t *node);
				121	static reg_errcode_t free_tree (void extra, bin_tree_t node);
				122	static reg_errcode_t mark_opt_subexp (void extra, bin_tree_t node);
				123
				124	/* This table gives an error message for each of the error codes listed
				125	in regex.h. Obviously the order here has to be same as there.
				126	POSIX doesn't require that we do anything for REG_NOERROR,
				127	but why not be nice? */
				128
				129	const char __re_error_msgid[] attribute_hidden =
				130	{
				131	#define REG_NOERROR_IDX 0
				132	gettext_noop ("Success") /* REG_NOERROR */
				133	"\0"
				134	#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
				135	gettext_noop ("No match") /* REG_NOMATCH */
				136	"\0"
				137	#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
				138	gettext_noop ("Invalid regular expression") /* REG_BADPAT */
				139	"\0"
				140	#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
				141	gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
				142	"\0"
				143	#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
				144	gettext_noop ("Invalid character class name") /* REG_ECTYPE */
				145	"\0"
				146	#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
				147	gettext_noop ("Trailing backslash") /* REG_EESCAPE */
				148	"\0"
				149	#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
				150	gettext_noop ("Invalid back reference") /* REG_ESUBREG */
				151	"\0"
				152	#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
				153	gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
				154	"\0"
				155	#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
				156	gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
				157	"\0"
				158	#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
				159	gettext_noop ("Unmatched \\{") /* REG_EBRACE */
				160	"\0"
				161	#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
				162	gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
				163	"\0"
				164	#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
				165	gettext_noop ("Invalid range end") /* REG_ERANGE */
				166	"\0"
				167	#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
				168	gettext_noop ("Memory exhausted") /* REG_ESPACE */
				169	"\0"
				170	#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
				171	gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
				172	"\0"
				173	#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
				174	gettext_noop ("Premature end of regular expression") /* REG_EEND */
				175	"\0"
				176	#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
				177	gettext_noop ("Regular expression too big") /* REG_ESIZE */
				178	"\0"
				179	#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
				180	gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
				181	};
				182
				183	const size_t __re_error_msgid_idx[] attribute_hidden =
				184	{
				185	REG_NOERROR_IDX,
				186	REG_NOMATCH_IDX,
				187	REG_BADPAT_IDX,
				188	REG_ECOLLATE_IDX,
				189	REG_ECTYPE_IDX,
				190	REG_EESCAPE_IDX,
				191	REG_ESUBREG_IDX,
				192	REG_EBRACK_IDX,
				193	REG_EPAREN_IDX,
				194	REG_EBRACE_IDX,
				195	REG_BADBR_IDX,
				196	REG_ERANGE_IDX,
				197	REG_ESPACE_IDX,
				198	REG_BADRPT_IDX,
				199	REG_EEND_IDX,
				200	REG_ESIZE_IDX,
				201	REG_ERPAREN_IDX
				202	};
				203
				204	/* Entry points for GNU code. */
				205
				206
				207	#ifdef ZOS_USS
				208
				209	/* For ZOS USS we must define btowc */
				210
				211	wchar_t
				212	btowc (int c)
				213	{
				214	wchar_t wtmp[2];
				215	char tmp[2];
				216
				217	tmp[0] = c;
				218	tmp[1] = 0;
				219
				220	mbtowc (wtmp, tmp, 1);
				221	return wtmp[0];
				222	}
				223	#endif
				224
				225	/* re_compile_pattern is the GNU regular expression compiler: it
				226	compiles PATTERN (of length LENGTH) and puts the result in BUFP.
				227	Returns 0 if the pattern was valid, otherwise an error string.
				228
				229	Assumes the `allocated' (and perhaps `buffer') and `translate' fields
				230	are set in BUFP on entry. */
				231
				232	const char *
Junio C Hamano	178b331	2010-08-19 15:31:30 -0700	[diff] [blame^]	233	re_compile_pattern (const char *pattern,
				234	size_t length,
				235	struct re_pattern_buffer *bufp)
Ævar Arnfjörð Bjarmason	d18f76d	2010-08-17 09:24:38 +0000	[diff] [blame]	236	{
				237	reg_errcode_t ret;
				238
				239	/* And GNU code determines whether or not to get register information
				240	by passing null for the REGS argument to re_match, etc., not by
				241	setting no_sub, unless RE_NO_SUB is set. */
				242	bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
				243
				244	/* Match anchors at newline. */
				245	bufp->newline_anchor = 1;
				246
				247	ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
				248
				249	if (!ret)
				250	return NULL;
				251	return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
				252	}
				253	#ifdef _LIBC
				254	weak_alias (__re_compile_pattern, re_compile_pattern)
				255	#endif
				256
				257	/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
				258	also be assigned to arbitrarily: each pattern buffer stores its own
				259	syntax, so it can be changed between regex compilations. */
				260	/* This has no initializer because initialized variables in Emacs
				261	become read-only after dumping. */
				262	reg_syntax_t re_syntax_options;
				263
				264
				265	/* Specify the precise syntax of regexps for compilation. This provides
				266	for compatibility for various utilities which historically have
				267	different, incompatible syntaxes.
				268
				269	The argument SYNTAX is a bit mask comprised of the various bits
				270	defined in regex.h. We return the old syntax. */
				271
				272	reg_syntax_t
Junio C Hamano	178b331	2010-08-19 15:31:30 -0700	[diff] [blame^]	273	re_set_syntax (reg_syntax_t syntax)
Ævar Arnfjörð Bjarmason	d18f76d	2010-08-17 09:24:38 +0000	[diff] [blame]	274	{
				275	reg_syntax_t ret = re_syntax_options;
				276
				277	re_syntax_options = syntax;
				278	return ret;
				279	}
				280	#ifdef _LIBC
				281	weak_alias (__re_set_syntax, re_set_syntax)
				282	#endif
				283
				284	int
Junio C Hamano	178b331	2010-08-19 15:31:30 -0700	[diff] [blame^]	285	re_compile_fastmap (struct re_pattern_buffer *bufp)
Ævar Arnfjörð Bjarmason	d18f76d	2010-08-17 09:24:38 +0000	[diff] [blame]	286	{
				287	re_dfa_t dfa = (re_dfa_t ) bufp->buffer;
				288	char *fastmap = bufp->fastmap;
				289
				290	memset (fastmap, '\0', sizeof (char) * SBC_MAX);
				291	re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
				292	if (dfa->init_state != dfa->init_state_word)
				293	re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
				294	if (dfa->init_state != dfa->init_state_nl)
				295	re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
				296	if (dfa->init_state != dfa->init_state_begbuf)
				297	re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
				298	bufp->fastmap_accurate = 1;
				299	return 0;
				300	}
				301	#ifdef _LIBC
				302	weak_alias (__re_compile_fastmap, re_compile_fastmap)
				303	#endif
				304
				305	static inline void
				306	__attribute ((always_inline))
				307	re_set_fastmap (char *fastmap, int icase, int ch)
				308	{
				309	fastmap[ch] = 1;
				310	if (icase)
				311	fastmap[tolower (ch)] = 1;
				312	}
				313
				314	/* Helper function for re_compile_fastmap.
				315	Compile fastmap for the initial_state INIT_STATE. */
				316
				317	static void
				318	re_compile_fastmap_iter (regex_t bufp, const re_dfastate_t init_state,
				319	char *fastmap)
				320	{
				321	volatile re_dfa_t dfa = (re_dfa_t ) bufp->buffer;
				322	int node_cnt;
				323	int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
				324	for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
				325	{
				326	int node = init_state->nodes.elems[node_cnt];
				327	re_token_type_t type = dfa->nodes[node].type;
				328
				329	if (type == CHARACTER)
				330	{
				331	re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
				332	#ifdef RE_ENABLE_I18N
				333	if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
				334	{
				335	unsigned char buf = re_malloc (unsigned char, dfa->mb_cur_max), p;
				336	wchar_t wc;
				337	mbstate_t state;
				338
				339	p = buf;
				340	*p++ = dfa->nodes[node].opr.c;
				341	while (++node < dfa->nodes_len
				342	&& dfa->nodes[node].type == CHARACTER
				343	&& dfa->nodes[node].mb_partial)
				344	*p++ = dfa->nodes[node].opr.c;
				345	memset (&state, '\0', sizeof (state));
				346	if (__mbrtowc (&wc, (const char *) buf, p - buf,
				347	&state) == p - buf
				348	&& (__wcrtomb ((char *) buf, towlower (wc), &state)
				349	!= (size_t) -1))
				350	re_set_fastmap (fastmap, 0, buf[0]);
				351	re_free (buf);
				352	}
				353	#endif
				354	}
				355	else if (type == SIMPLE_BRACKET)
				356	{
				357	int i, ch;
				358	for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
				359	{
				360	int j;
				361	bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
				362	for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
				363	if (w & ((bitset_word_t) 1 << j))
				364	re_set_fastmap (fastmap, icase, ch);
				365	}
				366	}
				367	#ifdef RE_ENABLE_I18N
				368	else if (type == COMPLEX_BRACKET)
				369	{
				370	re_charset_t *cset = dfa->nodes[node].opr.mbcset;
				371	int i;
				372
				373	# ifdef _LIBC
				374	/* See if we have to try all bytes which start multiple collation
				375	elements.
				376	e.g. In da_DK, we want to catch 'a' since "aa" is a valid
				377	collation element, and don't catch 'b' since 'b' is
				378	the only collation element which starts from 'b' (and
				379	it is caught by SIMPLE_BRACKET). */
				380	if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
				381	&& (cset->ncoll_syms \|\| cset->nranges))
				382	{
				383	const int32_t table = (const int32_t )
				384	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
				385	for (i = 0; i < SBC_MAX; ++i)
				386	if (table[i] < 0)
				387	re_set_fastmap (fastmap, icase, i);
				388	}
				389	# endif /* _LIBC */
				390
				391	/* See if we have to start the match at all multibyte characters,
				392	i.e. where we would not find an invalid sequence. This only
				393	applies to multibyte character sets; for single byte character
				394	sets, the SIMPLE_BRACKET again suffices. */
				395	if (dfa->mb_cur_max > 1
				396	&& (cset->nchar_classes \|\| cset->non_match \|\| cset->nranges
				397	# ifdef _LIBC
				398	\|\| cset->nequiv_classes
				399	# endif /* _LIBC */
				400	))
				401	{
				402	unsigned char c = 0;
				403	do
				404	{
				405	mbstate_t mbs;
				406	memset (&mbs, 0, sizeof (mbs));
				407	if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
				408	re_set_fastmap (fastmap, false, (int) c);
				409	}
				410	while (++c != 0);
				411	}
				412
				413	else
				414	{
				415	/* ... Else catch all bytes which can start the mbchars. */
				416	for (i = 0; i < cset->nmbchars; ++i)
				417	{
				418	char buf[256];
				419	mbstate_t state;
				420	memset (&state, '\0', sizeof (state));
				421	if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
				422	re_set_fastmap (fastmap, icase, (unsigned char ) buf);
				423	if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
				424	{
				425	if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
				426	!= (size_t) -1)
				427	re_set_fastmap (fastmap, false, (unsigned char ) buf);
				428	}
				429	}
				430	}
				431	}
				432	#endif /* RE_ENABLE_I18N */
				433	else if (type == OP_PERIOD
				434	#ifdef RE_ENABLE_I18N
				435	\|\| type == OP_UTF8_PERIOD
				436	#endif /* RE_ENABLE_I18N */
				437	\|\| type == END_OF_RE)
				438	{
				439	memset (fastmap, '\1', sizeof (char) * SBC_MAX);
				440	if (type == END_OF_RE)
				441	bufp->can_be_null = 1;
				442	return;
				443	}
				444	}
				445	}
				446
				447	/* Entry point for POSIX code. */
				448	/* regcomp takes a regular expression as a string and compiles it.
				449
				450	PREG is a regex_t *. We do not expect any fields to be initialized,
				451	since POSIX says we shouldn't. Thus, we set
				452
				453	`buffer' to the compiled pattern;
				454	`used' to the length of the compiled pattern;
				455	`syntax' to RE_SYNTAX_POSIX_EXTENDED if the
				456	REG_EXTENDED bit in CFLAGS is set; otherwise, to
				457	RE_SYNTAX_POSIX_BASIC;
				458	`newline_anchor' to REG_NEWLINE being set in CFLAGS;
				459	`fastmap' to an allocated space for the fastmap;
				460	`fastmap_accurate' to zero;
				461	`re_nsub' to the number of subexpressions in PATTERN.
				462
				463	PATTERN is the address of the pattern string.
				464
				465	CFLAGS is a series of bits which affect compilation.
				466
				467	If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
				468	use POSIX basic syntax.
				469
				470	If REG_NEWLINE is set, then . and [^...] don't match newline.
				471	Also, regexec will try a match beginning after every newline.
				472
				473	If REG_ICASE is set, then we considers upper- and lowercase
				474	versions of letters to be equivalent when matching.
				475
				476	If REG_NOSUB is set, then when PREG is passed to regexec, that
				477	routine will report only success or failure, and nothing about the
				478	registers.
				479
				480	It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
				481	the return codes and their meanings.) */
				482
				483	int
Junio C Hamano	178b331	2010-08-19 15:31:30 -0700	[diff] [blame^]	484	regcomp (regex_t *__restrict preg,
				485	const char *__restrict pattern,
				486	int cflags)
Ævar Arnfjörð Bjarmason	d18f76d	2010-08-17 09:24:38 +0000	[diff] [blame]	487	{
				488	reg_errcode_t ret;
				489	reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
				490	: RE_SYNTAX_POSIX_BASIC);
				491
				492	preg->buffer = NULL;
				493	preg->allocated = 0;
				494	preg->used = 0;
				495
				496	/* Try to allocate space for the fastmap. */
				497	preg->fastmap = re_malloc (char, SBC_MAX);
				498	if (BE (preg->fastmap == NULL, 0))
				499	return REG_ESPACE;
				500
				501	syntax \|= (cflags & REG_ICASE) ? RE_ICASE : 0;
				502
				503	/* If REG_NEWLINE is set, newlines are treated differently. */
				504	if (cflags & REG_NEWLINE)
				505	{ /* REG_NEWLINE implies neither . nor [^...] match newline. */
				506	syntax &= ~RE_DOT_NEWLINE;
				507	syntax \|= RE_HAT_LISTS_NOT_NEWLINE;
				508	/* It also changes the matching behavior. */
				509	preg->newline_anchor = 1;
				510	}
				511	else
				512	preg->newline_anchor = 0;
				513	preg->no_sub = !!(cflags & REG_NOSUB);
				514	preg->translate = NULL;
				515
				516	ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
				517
				518	/* POSIX doesn't distinguish between an unmatched open-group and an
				519	unmatched close-group: both are REG_EPAREN. */
				520	if (ret == REG_ERPAREN)
				521	ret = REG_EPAREN;
				522
				523	/* We have already checked preg->fastmap != NULL. */
				524	if (BE (ret == REG_NOERROR, 1))
				525	/* Compute the fastmap now, since regexec cannot modify the pattern
				526	buffer. This function never fails in this implementation. */
				527	(void) re_compile_fastmap (preg);
				528	else
				529	{
				530	/* Some error occurred while compiling the expression. */
				531	re_free (preg->fastmap);
				532	preg->fastmap = NULL;
				533	}
				534
				535	return (int) ret;
				536	}
				537	#ifdef _LIBC
				538	weak_alias (__regcomp, regcomp)
				539	#endif
				540
				541	/* Returns a message corresponding to an error code, ERRCODE, returned
				542	from either regcomp or regexec. We don't use PREG here. */
				543
				544	size_t
Frank Li	de83172	2010-08-17 09:24:40 +0000	[diff] [blame]	545	regerror(int errcode, const regex_t *__restrict preg,
				546	char *__restrict errbuf, size_t errbuf_size)
Ævar Arnfjörð Bjarmason	d18f76d	2010-08-17 09:24:38 +0000	[diff] [blame]	547	{
				548	const char *msg;
				549	size_t msg_size;
				550
				551	if (BE (errcode < 0
				552	\|\| errcode >= (int) (sizeof (__re_error_msgid_idx)
				553	/ sizeof (__re_error_msgid_idx[0])), 0))
				554	/* Only error codes returned by the rest of the code should be passed
				555	to this routine. If we are given anything else, or if other regex
				556	code generates an invalid error code, then the program has a bug.
				557	Dump core so we can fix it. */
				558	abort ();
				559
				560	msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
				561
				562	msg_size = strlen (msg) + 1; /* Includes the null. */
				563
				564	if (BE (errbuf_size != 0, 1))
				565	{
				566	if (BE (msg_size > errbuf_size, 0))
				567	{
				568	memcpy (errbuf, msg, errbuf_size - 1);
				569	errbuf[errbuf_size - 1] = 0;
				570	}
				571	else
				572	memcpy (errbuf, msg, msg_size);
				573	}
				574
				575	return msg_size;
				576	}
				577	#ifdef _LIBC
				578	weak_alias (__regerror, regerror)
				579	#endif
				580
				581
				582	#ifdef RE_ENABLE_I18N
				583	/* This static array is used for the map to single-byte characters when
				584	UTF-8 is used. Otherwise we would allocate memory just to initialize
				585	it the same all the time. UTF-8 is the preferred encoding so this is
				586	a worthwhile optimization. */
				587	#if __GNUC__ >= 3
				588	static const bitset_t utf8_sb_map = {
				589	/* Set the first 128 bits. */
				590	[0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
				591	};
				592	#else /* ! (__GNUC__ >= 3) */
				593	static bitset_t utf8_sb_map;
				594	#endif /* __GNUC__ >= 3 */
				595	#endif /* RE_ENABLE_I18N */
				596
				597
				598	static void
				599	free_dfa_content (re_dfa_t *dfa)
				600	{
				601	int i, j;
				602
				603	if (dfa->nodes)
				604	for (i = 0; i < dfa->nodes_len; ++i)
				605	free_token (dfa->nodes + i);
				606	re_free (dfa->nexts);
				607	for (i = 0; i < dfa->nodes_len; ++i)
				608	{
				609	if (dfa->eclosures != NULL)
				610	re_node_set_free (dfa->eclosures + i);
				611	if (dfa->inveclosures != NULL)
				612	re_node_set_free (dfa->inveclosures + i);
				613	if (dfa->edests != NULL)
				614	re_node_set_free (dfa->edests + i);
				615	}
				616	re_free (dfa->edests);
				617	re_free (dfa->eclosures);
				618	re_free (dfa->inveclosures);
				619	re_free (dfa->nodes);
				620
				621	if (dfa->state_table)
				622	for (i = 0; i <= dfa->state_hash_mask; ++i)
				623	{
				624	struct re_state_table_entry *entry = dfa->state_table + i;
				625	for (j = 0; j < entry->num; ++j)
				626	{
				627	re_dfastate_t *state = entry->array[j];
				628	free_state (state);
				629	}
				630	re_free (entry->array);
				631	}
				632	re_free (dfa->state_table);
				633	#ifdef RE_ENABLE_I18N
				634	if (dfa->sb_char != utf8_sb_map)
				635	re_free (dfa->sb_char);
				636	#endif
				637	re_free (dfa->subexp_map);
				638	#ifdef DEBUG
				639	re_free (dfa->re_str);
				640	#endif
				641
				642	re_free (dfa);
				643	}
				644
				645
				646	/* Free dynamically allocated space used by PREG. */
				647
				648	void
Junio C Hamano	178b331	2010-08-19 15:31:30 -0700	[diff] [blame^]	649	regfree (regex_t *preg)
Ævar Arnfjörð Bjarmason	d18f76d	2010-08-17 09:24:38 +0000	[diff] [blame]	650	{
				651	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
				652	if (BE (dfa != NULL, 1))
				653	free_dfa_content (dfa);
				654	preg->buffer = NULL;
				655	preg->allocated = 0;
				656
				657	re_free (preg->fastmap);
				658	preg->fastmap = NULL;
				659
				660	re_free (preg->translate);
				661	preg->translate = NULL;
				662	}
				663	#ifdef _LIBC
				664	weak_alias (__regfree, regfree)
				665	#endif
				666
				667	/* Entry points compatible with 4.2 BSD regex library. We don't define
				668	them unless specifically requested. */
				669
				670	#if defined _REGEX_RE_COMP \|\| defined _LIBC
				671
				672	/* BSD has one and only one pattern buffer. */
				673	static struct re_pattern_buffer re_comp_buf;
				674
				675	char *
				676	# ifdef _LIBC
				677	/* Make these definitions weak in libc, so POSIX programs can redefine
				678	these names if they don't use our functions, and still use
				679	regcomp/regexec above without link errors. */
				680	weak_function
				681	# endif
				682	re_comp (s)
				683	const char *s;
				684	{
				685	reg_errcode_t ret;
				686	char *fastmap;
				687
				688	if (!s)
				689	{
				690	if (!re_comp_buf.buffer)
				691	return gettext ("No previous regular expression");
				692	return 0;
				693	}
				694
				695	if (re_comp_buf.buffer)
				696	{
				697	fastmap = re_comp_buf.fastmap;
				698	re_comp_buf.fastmap = NULL;
				699	__regfree (&re_comp_buf);
				700	memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
				701	re_comp_buf.fastmap = fastmap;
				702	}
				703
				704	if (re_comp_buf.fastmap == NULL)
				705	{
				706	re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
				707	if (re_comp_buf.fastmap == NULL)
				708	return (char *) gettext (__re_error_msgid
				709	+ __re_error_msgid_idx[(int) REG_ESPACE]);
				710	}
				711
				712	/* Since `re_exec' always passes NULL for the `regs' argument, we
				713	don't need to initialize the pattern buffer fields which affect it. */
				714
				715	/* Match anchors at newlines. */
				716	re_comp_buf.newline_anchor = 1;
				717
				718	ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
				719
				720	if (!ret)
				721	return NULL;
				722
				723	/* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
				724	return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
				725	}
				726
				727	#ifdef _LIBC
				728	libc_freeres_fn (free_mem)
				729	{
				730	__regfree (&re_comp_buf);
				731	}
				732	#endif
				733
				734	#endif /* _REGEX_RE_COMP */
				735
				736	/* Internal entry point.
				737	Compile the regular expression PATTERN, whose length is LENGTH.
				738	SYNTAX indicate regular expression's syntax. */
				739
				740	static reg_errcode_t
				741	re_compile_internal (regex_t preg, const char pattern, size_t length,
				742	reg_syntax_t syntax)
				743	{
				744	reg_errcode_t err = REG_NOERROR;
				745	re_dfa_t *dfa;
				746	re_string_t regexp;
				747
				748	/* Initialize the pattern buffer. */
				749	preg->fastmap_accurate = 0;
				750	preg->syntax = syntax;
				751	preg->not_bol = preg->not_eol = 0;
				752	preg->used = 0;
				753	preg->re_nsub = 0;
				754	preg->can_be_null = 0;
				755	preg->regs_allocated = REGS_UNALLOCATED;
				756
				757	/* Initialize the dfa. */
				758	dfa = (re_dfa_t *) preg->buffer;
				759	if (BE (preg->allocated < sizeof (re_dfa_t), 0))
				760	{
				761	/* If zero allocated, but buffer is non-null, try to realloc
				762	enough space. This loses if buffer's address is bogus, but
				763	that is the user's responsibility. If ->buffer is NULL this
				764	is a simple allocation. */
				765	dfa = re_realloc (preg->buffer, re_dfa_t, 1);
				766	if (dfa == NULL)
				767	return REG_ESPACE;
				768	preg->allocated = sizeof (re_dfa_t);
				769	preg->buffer = (unsigned char *) dfa;
				770	}
				771	preg->used = sizeof (re_dfa_t);
				772
				773	err = init_dfa (dfa, length);
				774	if (BE (err != REG_NOERROR, 0))
				775	{
				776	free_dfa_content (dfa);
				777	preg->buffer = NULL;
				778	preg->allocated = 0;
				779	return err;
				780	}
				781	#ifdef DEBUG
				782	/* Note: length+1 will not overflow since it is checked in init_dfa. */
				783	dfa->re_str = re_malloc (char, length + 1);
				784	strncpy (dfa->re_str, pattern, length + 1);
				785	#endif
				786
				787	__libc_lock_init (dfa->lock);
				788
				789	err = re_string_construct (&regexp, pattern, length, preg->translate,
				790	syntax & RE_ICASE, dfa);
				791	if (BE (err != REG_NOERROR, 0))
				792	{
				793	re_compile_internal_free_return:
				794	free_workarea_compile (preg);
				795	re_string_destruct (&regexp);
				796	free_dfa_content (dfa);
				797	preg->buffer = NULL;
				798	preg->allocated = 0;
				799	return err;
				800	}
				801
				802	/* Parse the regular expression, and build a structure tree. */
				803	preg->re_nsub = 0;
				804	dfa->str_tree = parse (&regexp, preg, syntax, &err);
				805	if (BE (dfa->str_tree == NULL, 0))
				806	goto re_compile_internal_free_return;
				807
				808	/* Analyze the tree and create the nfa. */
				809	err = analyze (preg);
				810	if (BE (err != REG_NOERROR, 0))
				811	goto re_compile_internal_free_return;
				812
				813	#ifdef RE_ENABLE_I18N
				814	/* If possible, do searching in single byte encoding to speed things up. */
				815	if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
				816	optimize_utf8 (dfa);
				817	#endif
				818
				819	/* Then create the initial state of the dfa. */
				820	err = create_initial_state (dfa);
				821
				822	/* Release work areas. */
				823	free_workarea_compile (preg);
				824	re_string_destruct (&regexp);
				825
				826	if (BE (err != REG_NOERROR, 0))
				827	{
				828	free_dfa_content (dfa);
				829	preg->buffer = NULL;
				830	preg->allocated = 0;
				831	}
				832
				833	return err;
				834	}
				835
				836	/* Initialize DFA. We use the length of the regular expression PAT_LEN
				837	as the initial length of some arrays. */
				838
				839	static reg_errcode_t
				840	init_dfa (re_dfa_t *dfa, size_t pat_len)
				841	{
				842	unsigned int table_size;
				843	#ifndef _LIBC
				844	char *codeset_name;
				845	#endif
				846
				847	memset (dfa, '\0', sizeof (re_dfa_t));
				848
				849	/* Force allocation of str_tree_storage the first time. */
				850	dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
				851
				852	/* Avoid overflows. */
				853	if (pat_len == SIZE_MAX)
				854	return REG_ESPACE;
				855
				856	dfa->nodes_alloc = pat_len + 1;
				857	dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
				858
				859	/* table_size = 2 ^ ceil(log pat_len) */
				860	for (table_size = 1; ; table_size <<= 1)
				861	if (table_size > pat_len)
				862	break;
				863
				864	dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
				865	dfa->state_hash_mask = table_size - 1;
				866
				867	dfa->mb_cur_max = MB_CUR_MAX;
				868	#ifdef _LIBC
				869	if (dfa->mb_cur_max == 6
				870	&& strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
				871	dfa->is_utf8 = 1;
				872	dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
				873	!= 0);
				874	#else
				875	# ifdef HAVE_LANGINFO_CODESET
				876	codeset_name = nl_langinfo (CODESET);
				877	# else
				878	codeset_name = getenv ("LC_ALL");
				879	if (codeset_name == NULL \|\| codeset_name[0] == '\0')
				880	codeset_name = getenv ("LC_CTYPE");
				881	if (codeset_name == NULL \|\| codeset_name[0] == '\0')
				882	codeset_name = getenv ("LANG");
				883	if (codeset_name == NULL)
				884	codeset_name = "";
				885	else if (strchr (codeset_name, '.') != NULL)
				886	codeset_name = strchr (codeset_name, '.') + 1;
				887	# endif
				888
				889	/* strcasecmp isn't a standard interface. brute force check */
				890	#if 0
				891	if (strcasecmp (codeset_name, "UTF-8") == 0
				892	\|\| strcasecmp (codeset_name, "UTF8") == 0)
				893	dfa->is_utf8 = 1;
				894	#else
				895	if ( (codeset_name[0] == 'U' \|\| codeset_name[0] == 'u')
				896	&& (codeset_name[1] == 'T' \|\| codeset_name[1] == 't')
				897	&& (codeset_name[2] == 'F' \|\| codeset_name[2] == 'f')
				898	&& (codeset_name[3] == '-'
				899	? codeset_name[4] == '8' && codeset_name[5] == '\0'
				900	: codeset_name[3] == '8' && codeset_name[4] == '\0'))
				901	dfa->is_utf8 = 1;
				902	#endif
				903
				904	/* We check exhaustively in the loop below if this charset is a
				905	superset of ASCII. */
				906	dfa->map_notascii = 0;
				907	#endif
				908
				909	#ifdef RE_ENABLE_I18N
				910	if (dfa->mb_cur_max > 1)
				911	{
				912	if (dfa->is_utf8)
				913	{
				914	#if !defined(__GNUC__) \|\| __GNUC__ < 3
				915	static short utf8_sb_map_inited = 0;
				916
				917	if (! utf8_sb_map_inited)
				918	{
				919	int i;
				920
				921	utf8_sb_map_inited = 0;
				922	for (i = 0; i <= 0x80 / BITSET_WORD_BITS - 1; i++)
				923	utf8_sb_map[i] = BITSET_WORD_MAX;
				924	}
				925	#endif
				926	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
				927	}
				928	else
				929	{
				930	int i, j, ch;
				931
				932	dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
				933	if (BE (dfa->sb_char == NULL, 0))
				934	return REG_ESPACE;
				935
				936	/* Set the bits corresponding to single byte chars. */
				937	for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
				938	for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
				939	{
				940	wint_t wch = __btowc (ch);
				941	if (wch != WEOF)
				942	dfa->sb_char[i] \|= (bitset_word_t) 1 << j;
				943	# ifndef _LIBC
				944	if (isascii (ch) && wch != ch)
				945	dfa->map_notascii = 1;
				946	# endif
				947	}
				948	}
				949	}
				950	#endif
				951
				952	if (BE (dfa->nodes == NULL \|\| dfa->state_table == NULL, 0))
				953	return REG_ESPACE;
				954	return REG_NOERROR;
				955	}
				956
				957	/* Initialize WORD_CHAR table, which indicate which character is
				958	"word". In this case "word" means that it is the word construction
				959	character used by some operators like "\<", "\>", etc. */
				960
				961	static void
				962	internal_function
				963	init_word_char (re_dfa_t *dfa)
				964	{
				965	int i, j, ch;
				966	dfa->word_ops_used = 1;
				967	for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
				968	for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
				969	if (isalnum (ch) \|\| ch == '_')
				970	dfa->word_char[i] \|= (bitset_word_t) 1 << j;
				971	}
				972
				973	/* Free the work area which are only used while compiling. */
				974
				975	static void
				976	free_workarea_compile (regex_t *preg)
				977	{
				978	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
				979	bin_tree_storage_t storage, next;
				980	for (storage = dfa->str_tree_storage; storage; storage = next)
				981	{
				982	next = storage->next;
				983	re_free (storage);
				984	}
				985	dfa->str_tree_storage = NULL;
				986	dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
				987	dfa->str_tree = NULL;
				988	re_free (dfa->org_indices);
				989	dfa->org_indices = NULL;
				990	}
				991
				992	/* Create initial states for all contexts. */
				993
				994	static reg_errcode_t
				995	create_initial_state (re_dfa_t *dfa)
				996	{
				997	int first, i;
				998	reg_errcode_t err;
				999	re_node_set init_nodes;
				1000
				1001	/* Initial states have the epsilon closure of the node which is
				1002	the first node of the regular expression. */
				1003	first = dfa->str_tree->first->node_idx;
				1004	dfa->init_node = first;
				1005	err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
				1006	if (BE (err != REG_NOERROR, 0))
				1007	return err;
				1008
				1009	/* The back-references which are in initial states can epsilon transit,
				1010	since in this case all of the subexpressions can be null.
				1011	Then we add epsilon closures of the nodes which are the next nodes of
				1012	the back-references. */
				1013	if (dfa->nbackref > 0)
				1014	for (i = 0; i < init_nodes.nelem; ++i)
				1015	{
				1016	int node_idx = init_nodes.elems[i];
				1017	re_token_type_t type = dfa->nodes[node_idx].type;
				1018
				1019	int clexp_idx;
				1020	if (type != OP_BACK_REF)
				1021	continue;
				1022	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
				1023	{
				1024	re_token_t *clexp_node;
				1025	clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
				1026	if (clexp_node->type == OP_CLOSE_SUBEXP
				1027	&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
				1028	break;
				1029	}
				1030	if (clexp_idx == init_nodes.nelem)
				1031	continue;
				1032
				1033	if (type == OP_BACK_REF)
				1034	{
				1035	int dest_idx = dfa->edests[node_idx].elems[0];
				1036	if (!re_node_set_contains (&init_nodes, dest_idx))
				1037	{
				1038	reg_errcode_t err = re_node_set_merge (&init_nodes,
				1039	dfa->eclosures
				1040	+ dest_idx);
				1041	if (err != REG_NOERROR)
				1042	return err;
				1043	i = 0;
				1044	}
				1045	}
				1046	}
				1047
				1048	/* It must be the first time to invoke acquire_state. */
				1049	dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
				1050	/* We don't check ERR here, since the initial state must not be NULL. */
				1051	if (BE (dfa->init_state == NULL, 0))
				1052	return err;
				1053	if (dfa->init_state->has_constraint)
				1054	{
				1055	dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
				1056	CONTEXT_WORD);
				1057	dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
				1058	CONTEXT_NEWLINE);
				1059	dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
				1060	&init_nodes,
				1061	CONTEXT_NEWLINE
				1062	\| CONTEXT_BEGBUF);
				1063	if (BE (dfa->init_state_word == NULL \|\| dfa->init_state_nl == NULL
				1064	\|\| dfa->init_state_begbuf == NULL, 0))
				1065	return err;
				1066	}
				1067	else
				1068	dfa->init_state_word = dfa->init_state_nl
				1069	= dfa->init_state_begbuf = dfa->init_state;
				1070
				1071	re_node_set_free (&init_nodes);
				1072	return REG_NOERROR;
				1073	}
				1074
				1075	#ifdef RE_ENABLE_I18N
				1076	/* If it is possible to do searching in single byte encoding instead of UTF-8
				1077	to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
				1078	DFA nodes where needed. */
				1079
				1080	static void
				1081	optimize_utf8 (re_dfa_t *dfa)
				1082	{
				1083	int node, i, mb_chars = 0, has_period = 0;
				1084
				1085	for (node = 0; node < dfa->nodes_len; ++node)
				1086	switch (dfa->nodes[node].type)
				1087	{
				1088	case CHARACTER:
				1089	if (dfa->nodes[node].opr.c >= 0x80)
				1090	mb_chars = 1;
				1091	break;
				1092	case ANCHOR:
				1093	switch (dfa->nodes[node].opr.ctx_type)
				1094	{
				1095	case LINE_FIRST:
				1096	case LINE_LAST:
				1097	case BUF_FIRST:
				1098	case BUF_LAST:
				1099	break;
				1100	default:
				1101	/* Word anchors etc. cannot be handled. It's okay to test
				1102	opr.ctx_type since constraints (for all DFA nodes) are
				1103	created by ORing one or more opr.ctx_type values. */
				1104	return;
				1105	}
				1106	break;
				1107	case OP_PERIOD:
				1108	has_period = 1;
				1109	break;
				1110	case OP_BACK_REF:
				1111	case OP_ALT:
				1112	case END_OF_RE:
				1113	case OP_DUP_ASTERISK:
				1114	case OP_OPEN_SUBEXP:
				1115	case OP_CLOSE_SUBEXP:
				1116	break;
				1117	case COMPLEX_BRACKET:
				1118	return;
				1119	case SIMPLE_BRACKET:
				1120	/* Just double check. The non-ASCII range starts at 0x80. */
				1121	assert (0x80 % BITSET_WORD_BITS == 0);
				1122	for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
				1123	if (dfa->nodes[node].opr.sbcset[i])
				1124	return;
				1125	break;
				1126	default:
				1127	abort ();
				1128	}
				1129
				1130	if (mb_chars \|\| has_period)
				1131	for (node = 0; node < dfa->nodes_len; ++node)
				1132	{
				1133	if (dfa->nodes[node].type == CHARACTER
				1134	&& dfa->nodes[node].opr.c >= 0x80)
				1135	dfa->nodes[node].mb_partial = 0;
				1136	else if (dfa->nodes[node].type == OP_PERIOD)
				1137	dfa->nodes[node].type = OP_UTF8_PERIOD;
				1138	}
				1139
				1140	/* The search can be in single byte locale. */
				1141	dfa->mb_cur_max = 1;
				1142	dfa->is_utf8 = 0;
				1143	dfa->has_mb_node = dfa->nbackref > 0 \|\| has_period;
				1144	}
				1145	#endif
				1146
				1147	/* Analyze the structure tree, and calculate "first", "next", "edest",
				1148	"eclosure", and "inveclosure". */
				1149
				1150	static reg_errcode_t
				1151	analyze (regex_t *preg)
				1152	{
				1153	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
				1154	reg_errcode_t ret;
				1155
				1156	/* Allocate arrays. */
				1157	dfa->nexts = re_malloc (int, dfa->nodes_alloc);
				1158	dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
				1159	dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
				1160	dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
				1161	if (BE (dfa->nexts == NULL \|\| dfa->org_indices == NULL \|\| dfa->edests == NULL
				1162	\|\| dfa->eclosures == NULL, 0))
				1163	return REG_ESPACE;
				1164
				1165	dfa->subexp_map = re_malloc (int, preg->re_nsub);
				1166	if (dfa->subexp_map != NULL)
				1167	{
				1168	int i;
				1169	for (i = 0; i < preg->re_nsub; i++)
				1170	dfa->subexp_map[i] = i;
				1171	preorder (dfa->str_tree, optimize_subexps, dfa);
				1172	for (i = 0; i < preg->re_nsub; i++)
				1173	if (dfa->subexp_map[i] != i)
				1174	break;
				1175	if (i == preg->re_nsub)
				1176	{
				1177	free (dfa->subexp_map);
				1178	dfa->subexp_map = NULL;
				1179	}
				1180	}
				1181
				1182	ret = postorder (dfa->str_tree, lower_subexps, preg);
				1183	if (BE (ret != REG_NOERROR, 0))
				1184	return ret;
				1185	ret = postorder (dfa->str_tree, calc_first, dfa);
				1186	if (BE (ret != REG_NOERROR, 0))
				1187	return ret;
				1188	preorder (dfa->str_tree, calc_next, dfa);
				1189	ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
				1190	if (BE (ret != REG_NOERROR, 0))
				1191	return ret;
				1192	ret = calc_eclosure (dfa);
				1193	if (BE (ret != REG_NOERROR, 0))
				1194	return ret;
				1195
				1196	/* We only need this during the prune_impossible_nodes pass in regexec.c;
				1197	skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
				1198	if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
				1199	\|\| dfa->nbackref)
				1200	{
				1201	dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
				1202	if (BE (dfa->inveclosures == NULL, 0))
				1203	return REG_ESPACE;
				1204	ret = calc_inveclosure (dfa);
				1205	}
				1206
				1207	return ret;
				1208	}
				1209
				1210	/* Our parse trees are very unbalanced, so we cannot use a stack to
				1211	implement parse tree visits. Instead, we use parent pointers and
				1212	some hairy code in these two functions. */
				1213	static reg_errcode_t
				1214	postorder (bin_tree_t root, reg_errcode_t (fn (void , bin_tree_t *)),
				1215	void *extra)
				1216	{
				1217	bin_tree_t node, prev;
				1218
				1219	for (node = root; ; )
				1220	{
				1221	/* Descend down the tree, preferably to the left (or to the right
				1222	if that's the only child). */
				1223	while (node->left \|\| node->right)
				1224	if (node->left)
				1225	node = node->left;
				1226	else
				1227	node = node->right;
				1228
				1229	do
				1230	{
				1231	reg_errcode_t err = fn (extra, node);
				1232	if (BE (err != REG_NOERROR, 0))
				1233	return err;
				1234	if (node->parent == NULL)
				1235	return REG_NOERROR;
				1236	prev = node;
				1237	node = node->parent;
				1238	}
				1239	/* Go up while we have a node that is reached from the right. */
				1240	while (node->right == prev \|\| node->right == NULL);
				1241	node = node->right;
				1242	}
				1243	}
				1244
				1245	static reg_errcode_t
				1246	preorder (bin_tree_t root, reg_errcode_t (fn (void , bin_tree_t *)),
				1247	void *extra)
				1248	{
				1249	bin_tree_t *node;
				1250
				1251	for (node = root; ; )
				1252	{
				1253	reg_errcode_t err = fn (extra, node);
				1254	if (BE (err != REG_NOERROR, 0))
				1255	return err;
				1256
				1257	/* Go to the left node, or up and to the right. */
				1258	if (node->left)
				1259	node = node->left;
				1260	else
				1261	{
				1262	bin_tree_t *prev = NULL;
				1263	while (node->right == prev \|\| node->right == NULL)
				1264	{
				1265	prev = node;
				1266	node = node->parent;
				1267	if (!node)
				1268	return REG_NOERROR;
				1269	}
				1270	node = node->right;
				1271	}
				1272	}
				1273	}
				1274
				1275	/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
				1276	re_search_internal to map the inner one's opr.idx to this one's. Adjust
				1277	backreferences as well. Requires a preorder visit. */
				1278	static reg_errcode_t
				1279	optimize_subexps (void extra, bin_tree_t node)
				1280	{
				1281	re_dfa_t dfa = (re_dfa_t ) extra;
				1282
				1283	if (node->token.type == OP_BACK_REF && dfa->subexp_map)
				1284	{
				1285	int idx = node->token.opr.idx;
				1286	node->token.opr.idx = dfa->subexp_map[idx];
				1287	dfa->used_bkref_map \|= 1 << node->token.opr.idx;
				1288	}
				1289
				1290	else if (node->token.type == SUBEXP
				1291	&& node->left && node->left->token.type == SUBEXP)
				1292	{
				1293	int other_idx = node->left->token.opr.idx;
				1294
				1295	node->left = node->left->left;
				1296	if (node->left)
				1297	node->left->parent = node;
				1298
				1299	dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
				1300	if (other_idx < BITSET_WORD_BITS)
				1301	dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
				1302	}
				1303
				1304	return REG_NOERROR;
				1305	}
				1306
				1307	/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
				1308	of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
				1309	static reg_errcode_t
				1310	lower_subexps (void extra, bin_tree_t node)
				1311	{
				1312	regex_t preg = (regex_t ) extra;
				1313	reg_errcode_t err = REG_NOERROR;
				1314
				1315	if (node->left && node->left->token.type == SUBEXP)
				1316	{
				1317	node->left = lower_subexp (&err, preg, node->left);
				1318	if (node->left)
				1319	node->left->parent = node;
				1320	}
				1321	if (node->right && node->right->token.type == SUBEXP)
				1322	{
				1323	node->right = lower_subexp (&err, preg, node->right);
				1324	if (node->right)
				1325	node->right->parent = node;
				1326	}
				1327
				1328	return err;
				1329	}
				1330
				1331	static bin_tree_t *
				1332	lower_subexp (reg_errcode_t err, regex_t preg, bin_tree_t *node)
				1333	{
				1334	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
				1335	bin_tree_t *body = node->left;
				1336	bin_tree_t op, cls, tree1, tree;
				1337
				1338	if (preg->no_sub
				1339	/* We do not optimize empty subexpressions, because otherwise we may
				1340	have bad CONCAT nodes with NULL children. This is obviously not
				1341	very common, so we do not lose much. An example that triggers
				1342	this case is the sed "script" //x. */
				1343	&& node->left != NULL
				1344	&& (node->token.opr.idx >= BITSET_WORD_BITS
				1345	\|\| !(dfa->used_bkref_map
				1346	& ((bitset_word_t) 1 << node->token.opr.idx))))
				1347	return node->left;
				1348
				1349	/* Convert the SUBEXP node to the concatenation of an
				1350	OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
				1351	op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
				1352	cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
				1353	tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
				1354	tree = create_tree (dfa, op, tree1, CONCAT);
				1355	if (BE (tree == NULL \|\| tree1 == NULL \|\| op == NULL \|\| cls == NULL, 0))
				1356	{
				1357	*err = REG_ESPACE;
				1358	return NULL;
				1359	}
				1360
				1361	op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
				1362	op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
				1363	return tree;
				1364	}
				1365
				1366	/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
				1367	nodes. Requires a postorder visit. */
				1368	static reg_errcode_t
				1369	calc_first (void extra, bin_tree_t node)
				1370	{
				1371	re_dfa_t dfa = (re_dfa_t ) extra;
				1372	if (node->token.type == CONCAT)
				1373	{
				1374	node->first = node->left->first;
				1375	node->node_idx = node->left->node_idx;
				1376	}
				1377	else
				1378	{
				1379	node->first = node;
				1380	node->node_idx = re_dfa_add_node (dfa, node->token);
				1381	if (BE (node->node_idx == -1, 0))
				1382	return REG_ESPACE;
				1383	if (node->token.type == ANCHOR)
				1384	dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
				1385	}
				1386	return REG_NOERROR;
				1387	}
				1388
				1389	/* Pass 2: compute NEXT on the tree. Preorder visit. */
				1390	static reg_errcode_t
				1391	calc_next (void extra, bin_tree_t node)
				1392	{
				1393	switch (node->token.type)
				1394	{
				1395	case OP_DUP_ASTERISK:
				1396	node->left->next = node;
				1397	break;
				1398	case CONCAT:
				1399	node->left->next = node->right->first;
				1400	node->right->next = node->next;
				1401	break;
				1402	default:
				1403	if (node->left)
				1404	node->left->next = node->next;
				1405	if (node->right)
				1406	node->right->next = node->next;
				1407	break;
				1408	}
				1409	return REG_NOERROR;
				1410	}
				1411
				1412	/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
				1413	static reg_errcode_t
				1414	link_nfa_nodes (void extra, bin_tree_t node)
				1415	{
				1416	re_dfa_t dfa = (re_dfa_t ) extra;
				1417	int idx = node->node_idx;
				1418	reg_errcode_t err = REG_NOERROR;
				1419
				1420	switch (node->token.type)
				1421	{
				1422	case CONCAT:
				1423	break;
				1424
				1425	case END_OF_RE:
				1426	assert (node->next == NULL);
				1427	break;
				1428
				1429	case OP_DUP_ASTERISK:
				1430	case OP_ALT:
				1431	{
				1432	int left, right;
				1433	dfa->has_plural_match = 1;
				1434	if (node->left != NULL)
				1435	left = node->left->first->node_idx;
				1436	else
				1437	left = node->next->node_idx;
				1438	if (node->right != NULL)
				1439	right = node->right->first->node_idx;
				1440	else
				1441	right = node->next->node_idx;
				1442	assert (left > -1);
				1443	assert (right > -1);
				1444	err = re_node_set_init_2 (dfa->edests + idx, left, right);
				1445	}
				1446	break;
				1447
				1448	case ANCHOR:
				1449	case OP_OPEN_SUBEXP:
				1450	case OP_CLOSE_SUBEXP:
				1451	err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
				1452	break;
				1453
				1454	case OP_BACK_REF:
				1455	dfa->nexts[idx] = node->next->node_idx;
				1456	if (node->token.type == OP_BACK_REF)
				1457	err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
				1458	break;
				1459
				1460	default:
				1461	assert (!IS_EPSILON_NODE (node->token.type));
				1462	dfa->nexts[idx] = node->next->node_idx;
				1463	break;
				1464	}
				1465
				1466	return err;
				1467	}
				1468
				1469	/* Duplicate the epsilon closure of the node ROOT_NODE.
				1470	Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
				1471	to their own constraint. */
				1472
				1473	static reg_errcode_t
				1474	internal_function
				1475	duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
				1476	int root_node, unsigned int init_constraint)
				1477	{
				1478	int org_node, clone_node, ret;
				1479	unsigned int constraint = init_constraint;
				1480	for (org_node = top_org_node, clone_node = top_clone_node;;)
				1481	{
				1482	int org_dest, clone_dest;
				1483	if (dfa->nodes[org_node].type == OP_BACK_REF)
				1484	{
				1485	/* If the back reference epsilon-transit, its destination must
				1486	also have the constraint. Then duplicate the epsilon closure
				1487	of the destination of the back reference, and store it in
				1488	edests of the back reference. */
				1489	org_dest = dfa->nexts[org_node];
				1490	re_node_set_empty (dfa->edests + clone_node);
				1491	clone_dest = duplicate_node (dfa, org_dest, constraint);
				1492	if (BE (clone_dest == -1, 0))
				1493	return REG_ESPACE;
				1494	dfa->nexts[clone_node] = dfa->nexts[org_node];
				1495	ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
				1496	if (BE (ret < 0, 0))
				1497	return REG_ESPACE;
				1498	}
				1499	else if (dfa->edests[org_node].nelem == 0)
				1500	{
				1501	/* In case of the node can't epsilon-transit, don't duplicate the
				1502	destination and store the original destination as the
				1503	destination of the node. */
				1504	dfa->nexts[clone_node] = dfa->nexts[org_node];
				1505	break;
				1506	}
				1507	else if (dfa->edests[org_node].nelem == 1)
				1508	{
				1509	/* In case of the node can epsilon-transit, and it has only one
				1510	destination. */
				1511	org_dest = dfa->edests[org_node].elems[0];
				1512	re_node_set_empty (dfa->edests + clone_node);
				1513	/* If the node is root_node itself, it means the epsilon clsoure
				1514	has a loop. Then tie it to the destination of the root_node. */
				1515	if (org_node == root_node && clone_node != org_node)
				1516	{
				1517	ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
				1518	if (BE (ret < 0, 0))
				1519	return REG_ESPACE;
				1520	break;
				1521	}
				1522	/* In case of the node has another constraint, add it. */
				1523	constraint \|= dfa->nodes[org_node].constraint;
				1524	clone_dest = duplicate_node (dfa, org_dest, constraint);
				1525	if (BE (clone_dest == -1, 0))
				1526	return REG_ESPACE;
				1527	ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
				1528	if (BE (ret < 0, 0))
				1529	return REG_ESPACE;
				1530	}
				1531	else /* dfa->edests[org_node].nelem == 2 */
				1532	{
				1533	/* In case of the node can epsilon-transit, and it has two
				1534	destinations. In the bin_tree_t and DFA, that's '\|' and ''. /
				1535	org_dest = dfa->edests[org_node].elems[0];
				1536	re_node_set_empty (dfa->edests + clone_node);
				1537	/* Search for a duplicated node which satisfies the constraint. */
				1538	clone_dest = search_duplicated_node (dfa, org_dest, constraint);
				1539	if (clone_dest == -1)
				1540	{
				1541	/* There is no such duplicated node, create a new one. */
				1542	reg_errcode_t err;
				1543	clone_dest = duplicate_node (dfa, org_dest, constraint);
				1544	if (BE (clone_dest == -1, 0))
				1545	return REG_ESPACE;
				1546	ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
				1547	if (BE (ret < 0, 0))
				1548	return REG_ESPACE;
				1549	err = duplicate_node_closure (dfa, org_dest, clone_dest,
				1550	root_node, constraint);
				1551	if (BE (err != REG_NOERROR, 0))
				1552	return err;
				1553	}
				1554	else
				1555	{
				1556	/* There is a duplicated node which satisfies the constraint,
				1557	use it to avoid infinite loop. */
				1558	ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
				1559	if (BE (ret < 0, 0))
				1560	return REG_ESPACE;
				1561	}
				1562
				1563	org_dest = dfa->edests[org_node].elems[1];
				1564	clone_dest = duplicate_node (dfa, org_dest, constraint);
				1565	if (BE (clone_dest == -1, 0))
				1566	return REG_ESPACE;
				1567	ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
				1568	if (BE (ret < 0, 0))
				1569	return REG_ESPACE;
				1570	}
				1571	org_node = org_dest;
				1572	clone_node = clone_dest;
				1573	}
				1574	return REG_NOERROR;
				1575	}
				1576
				1577	/* Search for a node which is duplicated from the node ORG_NODE, and
				1578	satisfies the constraint CONSTRAINT. */
				1579
				1580	static int
				1581	search_duplicated_node (const re_dfa_t *dfa, int org_node,
				1582	unsigned int constraint)
				1583	{
				1584	int idx;
				1585	for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
				1586	{
				1587	if (org_node == dfa->org_indices[idx]
				1588	&& constraint == dfa->nodes[idx].constraint)
				1589	return idx; /* Found. */
				1590	}
				1591	return -1; /* Not found. */
				1592	}
				1593
				1594	/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
				1595	Return the index of the new node, or -1 if insufficient storage is
				1596	available. */
				1597
				1598	static int
				1599	duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
				1600	{
				1601	int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
				1602	if (BE (dup_idx != -1, 1))
				1603	{
				1604	dfa->nodes[dup_idx].constraint = constraint;
				1605	dfa->nodes[dup_idx].constraint \|= dfa->nodes[org_idx].constraint;
				1606	dfa->nodes[dup_idx].duplicated = 1;
				1607
				1608	/* Store the index of the original node. */
				1609	dfa->org_indices[dup_idx] = org_idx;
				1610	}
				1611	return dup_idx;
				1612	}
				1613
				1614	static reg_errcode_t
				1615	calc_inveclosure (re_dfa_t *dfa)
				1616	{
				1617	int src, idx, ret;
				1618	for (idx = 0; idx < dfa->nodes_len; ++idx)
				1619	re_node_set_init_empty (dfa->inveclosures + idx);
				1620
				1621	for (src = 0; src < dfa->nodes_len; ++src)
				1622	{
				1623	int *elems = dfa->eclosures[src].elems;
				1624	for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
				1625	{
				1626	ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
				1627	if (BE (ret == -1, 0))
				1628	return REG_ESPACE;
				1629	}
				1630	}
				1631
				1632	return REG_NOERROR;
				1633	}
				1634
				1635	/* Calculate "eclosure" for all the node in DFA. */
				1636
				1637	static reg_errcode_t
				1638	calc_eclosure (re_dfa_t *dfa)
				1639	{
				1640	int node_idx, incomplete;
				1641	#ifdef DEBUG
				1642	assert (dfa->nodes_len > 0);
				1643	#endif
				1644	incomplete = 0;
				1645	/* For each nodes, calculate epsilon closure. */
				1646	for (node_idx = 0; ; ++node_idx)
				1647	{
				1648	reg_errcode_t err;
				1649	re_node_set eclosure_elem;
				1650	if (node_idx == dfa->nodes_len)
				1651	{
				1652	if (!incomplete)
				1653	break;
				1654	incomplete = 0;
				1655	node_idx = 0;
				1656	}
				1657
				1658	#ifdef DEBUG
				1659	assert (dfa->eclosures[node_idx].nelem != -1);
				1660	#endif
				1661
				1662	/* If we have already calculated, skip it. */
				1663	if (dfa->eclosures[node_idx].nelem != 0)
				1664	continue;
				1665	/* Calculate epsilon closure of `node_idx'. */
				1666	err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
				1667	if (BE (err != REG_NOERROR, 0))
				1668	return err;
				1669
				1670	if (dfa->eclosures[node_idx].nelem == 0)
				1671	{
				1672	incomplete = 1;
				1673	re_node_set_free (&eclosure_elem);
				1674	}
				1675	}
				1676	return REG_NOERROR;
				1677	}
				1678
				1679	/* Calculate epsilon closure of NODE. */
				1680
				1681	static reg_errcode_t
				1682	calc_eclosure_iter (re_node_set new_set, re_dfa_t dfa, int node, int root)
				1683	{
				1684	reg_errcode_t err;
				1685	int i;
				1686	re_node_set eclosure;
				1687	int ret;
				1688	int incomplete = 0;
				1689	err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
				1690	if (BE (err != REG_NOERROR, 0))
				1691	return err;
				1692
				1693	/* This indicates that we are calculating this node now.
				1694	We reference this value to avoid infinite loop. */
				1695	dfa->eclosures[node].nelem = -1;
				1696
				1697	/* If the current node has constraints, duplicate all nodes
				1698	since they must inherit the constraints. */
				1699	if (dfa->nodes[node].constraint
				1700	&& dfa->edests[node].nelem
				1701	&& !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
				1702	{
				1703	err = duplicate_node_closure (dfa, node, node, node,
				1704	dfa->nodes[node].constraint);
				1705	if (BE (err != REG_NOERROR, 0))
				1706	return err;
				1707	}
				1708
				1709	/* Expand each epsilon destination nodes. */
				1710	if (IS_EPSILON_NODE(dfa->nodes[node].type))
				1711	for (i = 0; i < dfa->edests[node].nelem; ++i)
				1712	{
				1713	re_node_set eclosure_elem;
				1714	int edest = dfa->edests[node].elems[i];
				1715	/* If calculating the epsilon closure of `edest' is in progress,
				1716	return intermediate result. */
				1717	if (dfa->eclosures[edest].nelem == -1)
				1718	{
				1719	incomplete = 1;
				1720	continue;
				1721	}
				1722	/* If we haven't calculated the epsilon closure of `edest' yet,
				1723	calculate now. Otherwise use calculated epsilon closure. */
				1724	if (dfa->eclosures[edest].nelem == 0)
				1725	{
				1726	err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
				1727	if (BE (err != REG_NOERROR, 0))
				1728	return err;
				1729	}
				1730	else
				1731	eclosure_elem = dfa->eclosures[edest];
				1732	/* Merge the epsilon closure of `edest'. */
				1733	err = re_node_set_merge (&eclosure, &eclosure_elem);
				1734	if (BE (err != REG_NOERROR, 0))
				1735	return err;
				1736	/* If the epsilon closure of `edest' is incomplete,
				1737	the epsilon closure of this node is also incomplete. */
				1738	if (dfa->eclosures[edest].nelem == 0)
				1739	{
				1740	incomplete = 1;
				1741	re_node_set_free (&eclosure_elem);
				1742	}
				1743	}
				1744
				1745	/* An epsilon closure includes itself. */
				1746	ret = re_node_set_insert (&eclosure, node);
				1747	if (BE (ret < 0, 0))
				1748	return REG_ESPACE;
				1749	if (incomplete && !root)
				1750	dfa->eclosures[node].nelem = 0;
				1751	else
				1752	dfa->eclosures[node] = eclosure;
				1753	*new_set = eclosure;
				1754	return REG_NOERROR;
				1755	}
				1756
				1757	/* Functions for token which are used in the parser. */
				1758
				1759	/* Fetch a token from INPUT.
				1760	We must not use this function inside bracket expressions. */
				1761
				1762	static void
				1763	internal_function
				1764	fetch_token (re_token_t result, re_string_t input, reg_syntax_t syntax)
				1765	{
				1766	re_string_skip_bytes (input, peek_token (result, input, syntax));
				1767	}
				1768
				1769	/* Peek a token from INPUT, and return the length of the token.
				1770	We must not use this function inside bracket expressions. */
				1771
				1772	static int
				1773	internal_function
				1774	peek_token (re_token_t token, re_string_t input, reg_syntax_t syntax)
				1775	{
				1776	unsigned char c;
				1777
				1778	if (re_string_eoi (input))
				1779	{
				1780	token->type = END_OF_RE;
				1781	return 0;
				1782	}
				1783
				1784	c = re_string_peek_byte (input, 0);
				1785	token->opr.c = c;
				1786
				1787	token->word_char = 0;
				1788	#ifdef RE_ENABLE_I18N
				1789	token->mb_partial = 0;
				1790	if (input->mb_cur_max > 1 &&
				1791	!re_string_first_byte (input, re_string_cur_idx (input)))
				1792	{
				1793	token->type = CHARACTER;
				1794	token->mb_partial = 1;
				1795	return 1;
				1796	}
				1797	#endif
				1798	if (c == '\\')
				1799	{
				1800	unsigned char c2;
				1801	if (re_string_cur_idx (input) + 1 >= re_string_length (input))
				1802	{
				1803	token->type = BACK_SLASH;
				1804	return 1;
				1805	}
				1806
				1807	c2 = re_string_peek_byte_case (input, 1);
				1808	token->opr.c = c2;
				1809	token->type = CHARACTER;
				1810	#ifdef RE_ENABLE_I18N
				1811	if (input->mb_cur_max > 1)
				1812	{
				1813	wint_t wc = re_string_wchar_at (input,
				1814	re_string_cur_idx (input) + 1);
				1815	token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
				1816	}
				1817	else
				1818	#endif
				1819	token->word_char = IS_WORD_CHAR (c2) != 0;
				1820
				1821	switch (c2)
				1822	{
				1823	case '\|':
				1824	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
				1825	token->type = OP_ALT;
				1826	break;
				1827	case '1': case '2': case '3': case '4': case '5':
				1828	case '6': case '7': case '8': case '9':
				1829	if (!(syntax & RE_NO_BK_REFS))
				1830	{
				1831	token->type = OP_BACK_REF;
				1832	token->opr.idx = c2 - '1';
				1833	}
				1834	break;
				1835	case '<':
				1836	if (!(syntax & RE_NO_GNU_OPS))
				1837	{
				1838	token->type = ANCHOR;
				1839	token->opr.ctx_type = WORD_FIRST;
				1840	}
				1841	break;
				1842	case '>':
				1843	if (!(syntax & RE_NO_GNU_OPS))
				1844	{
				1845	token->type = ANCHOR;
				1846	token->opr.ctx_type = WORD_LAST;
				1847	}
				1848	break;
				1849	case 'b':
				1850	if (!(syntax & RE_NO_GNU_OPS))
				1851	{
				1852	token->type = ANCHOR;
				1853	token->opr.ctx_type = WORD_DELIM;
				1854	}
				1855	break;
				1856	case 'B':
				1857	if (!(syntax & RE_NO_GNU_OPS))
				1858	{
				1859	token->type = ANCHOR;
				1860	token->opr.ctx_type = NOT_WORD_DELIM;
				1861	}
				1862	break;
				1863	case 'w':
				1864	if (!(syntax & RE_NO_GNU_OPS))
				1865	token->type = OP_WORD;
				1866	break;
				1867	case 'W':
				1868	if (!(syntax & RE_NO_GNU_OPS))
				1869	token->type = OP_NOTWORD;
				1870	break;
				1871	case 's':
				1872	if (!(syntax & RE_NO_GNU_OPS))
				1873	token->type = OP_SPACE;
				1874	break;
				1875	case 'S':
				1876	if (!(syntax & RE_NO_GNU_OPS))
				1877	token->type = OP_NOTSPACE;
				1878	break;
				1879	case '`':
				1880	if (!(syntax & RE_NO_GNU_OPS))
				1881	{
				1882	token->type = ANCHOR;
				1883	token->opr.ctx_type = BUF_FIRST;
				1884	}
				1885	break;
				1886	case '\'':
				1887	if (!(syntax & RE_NO_GNU_OPS))
				1888	{
				1889	token->type = ANCHOR;
				1890	token->opr.ctx_type = BUF_LAST;
				1891	}
				1892	break;
				1893	case '(':
				1894	if (!(syntax & RE_NO_BK_PARENS))
				1895	token->type = OP_OPEN_SUBEXP;
				1896	break;
				1897	case ')':
				1898	if (!(syntax & RE_NO_BK_PARENS))
				1899	token->type = OP_CLOSE_SUBEXP;
				1900	break;
				1901	case '+':
				1902	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
				1903	token->type = OP_DUP_PLUS;
				1904	break;
				1905	case '?':
				1906	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
				1907	token->type = OP_DUP_QUESTION;
				1908	break;
				1909	case '{':
				1910	if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
				1911	token->type = OP_OPEN_DUP_NUM;
				1912	break;
				1913	case '}':
				1914	if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
				1915	token->type = OP_CLOSE_DUP_NUM;
				1916	break;
				1917	default:
				1918	break;
				1919	}
				1920	return 2;
				1921	}
				1922
				1923	token->type = CHARACTER;
				1924	#ifdef RE_ENABLE_I18N
				1925	if (input->mb_cur_max > 1)
				1926	{
				1927	wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
				1928	token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
				1929	}
				1930	else
				1931	#endif
				1932	token->word_char = IS_WORD_CHAR (token->opr.c);
				1933
				1934	switch (c)
				1935	{
				1936	case '\n':
				1937	if (syntax & RE_NEWLINE_ALT)
				1938	token->type = OP_ALT;
				1939	break;
				1940	case '\|':
				1941	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
				1942	token->type = OP_ALT;
				1943	break;
				1944	case '*':
				1945	token->type = OP_DUP_ASTERISK;
				1946	break;
				1947	case '+':
				1948	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
				1949	token->type = OP_DUP_PLUS;
				1950	break;
				1951	case '?':
				1952	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
				1953	token->type = OP_DUP_QUESTION;
				1954	break;
				1955	case '{':
				1956	if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
				1957	token->type = OP_OPEN_DUP_NUM;
				1958	break;
				1959	case '}':
				1960	if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
				1961	token->type = OP_CLOSE_DUP_NUM;
				1962	break;
				1963	case '(':
				1964	if (syntax & RE_NO_BK_PARENS)
				1965	token->type = OP_OPEN_SUBEXP;
				1966	break;
				1967	case ')':
				1968	if (syntax & RE_NO_BK_PARENS)
				1969	token->type = OP_CLOSE_SUBEXP;
				1970	break;
				1971	case '[':
				1972	token->type = OP_OPEN_BRACKET;
				1973	break;
				1974	case '.':
				1975	token->type = OP_PERIOD;
				1976	break;
				1977	case '^':
				1978	if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS \| RE_CARET_ANCHORS_HERE)) &&
				1979	re_string_cur_idx (input) != 0)
				1980	{
				1981	char prev = re_string_peek_byte (input, -1);
				1982	if (!(syntax & RE_NEWLINE_ALT) \|\| prev != '\n')
				1983	break;
				1984	}
				1985	token->type = ANCHOR;
				1986	token->opr.ctx_type = LINE_FIRST;
				1987	break;
				1988	case '$':
				1989	if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
				1990	re_string_cur_idx (input) + 1 != re_string_length (input))
				1991	{
				1992	re_token_t next;
				1993	re_string_skip_bytes (input, 1);
				1994	peek_token (&next, input, syntax);
				1995	re_string_skip_bytes (input, -1);
				1996	if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
				1997	break;
				1998	}
				1999	token->type = ANCHOR;
				2000	token->opr.ctx_type = LINE_LAST;
				2001	break;
				2002	default:
				2003	break;
				2004	}
				2005	return 1;
				2006	}
				2007
				2008	/* Peek a token from INPUT, and return the length of the token.
				2009	We must not use this function out of bracket expressions. */
				2010
				2011	static int
				2012	internal_function
				2013	peek_token_bracket (re_token_t token, re_string_t input, reg_syntax_t syntax)
				2014	{
				2015	unsigned char c;
				2016	if (re_string_eoi (input))
				2017	{
				2018	token->type = END_OF_RE;
				2019	return 0;
				2020	}
				2021	c = re_string_peek_byte (input, 0);
				2022	token->opr.c = c;
				2023
				2024	#ifdef RE_ENABLE_I18N
				2025	if (input->mb_cur_max > 1 &&
				2026	!re_string_first_byte (input, re_string_cur_idx (input)))
				2027	{
				2028	token->type = CHARACTER;
				2029	return 1;
				2030	}
				2031	#endif /* RE_ENABLE_I18N */
				2032
				2033	if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
				2034	&& re_string_cur_idx (input) + 1 < re_string_length (input))
				2035	{
				2036	/* In this case, '\' escape a character. */
				2037	unsigned char c2;
				2038	re_string_skip_bytes (input, 1);
				2039	c2 = re_string_peek_byte (input, 0);
				2040	token->opr.c = c2;
				2041	token->type = CHARACTER;
				2042	return 1;
				2043	}
				2044	if (c == '[') /* '[' is a special char in a bracket exps. */
				2045	{
				2046	unsigned char c2;
				2047	int token_len;
				2048	if (re_string_cur_idx (input) + 1 < re_string_length (input))
				2049	c2 = re_string_peek_byte (input, 1);
				2050	else
				2051	c2 = 0;
				2052	token->opr.c = c2;
				2053	token_len = 2;
				2054	switch (c2)
				2055	{
				2056	case '.':
				2057	token->type = OP_OPEN_COLL_ELEM;
				2058	break;
				2059	case '=':
				2060	token->type = OP_OPEN_EQUIV_CLASS;
				2061	break;
				2062	case ':':
				2063	if (syntax & RE_CHAR_CLASSES)
				2064	{
				2065	token->type = OP_OPEN_CHAR_CLASS;
				2066	break;
				2067	}
				2068	/* else fall through. */
				2069	default:
				2070	token->type = CHARACTER;
				2071	token->opr.c = c;
				2072	token_len = 1;
				2073	break;
				2074	}
				2075	return token_len;
				2076	}
				2077	switch (c)
				2078	{
				2079	case '-':
				2080	token->type = OP_CHARSET_RANGE;
				2081	break;
				2082	case ']':
				2083	token->type = OP_CLOSE_BRACKET;
				2084	break;
				2085	case '^':
				2086	token->type = OP_NON_MATCH_LIST;
				2087	break;
				2088	default:
				2089	token->type = CHARACTER;
				2090	}
				2091	return 1;
				2092	}
				2093
				2094	/* Functions for parser. */
				2095
				2096	/* Entry point of the parser.
				2097	Parse the regular expression REGEXP and return the structure tree.
				2098	If an error is occured, ERR is set by error code, and return NULL.
				2099	This function build the following tree, from regular expression <reg_exp>:
				2100	CAT
				2101	/ \
				2102	/ \
				2103	<reg_exp> EOR
				2104
				2105	CAT means concatenation.
				2106	EOR means end of regular expression. */
				2107
				2108	static bin_tree_t *
				2109	parse (re_string_t regexp, regex_t preg, reg_syntax_t syntax,
				2110	reg_errcode_t *err)
				2111	{
				2112	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
				2113	bin_tree_t tree, eor, *root;
				2114	re_token_t current_token;
				2115	dfa->syntax = syntax;
				2116	fetch_token (&current_token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
				2117	tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
				2118	if (BE (*err != REG_NOERROR && tree == NULL, 0))
				2119	return NULL;
				2120	eor = create_tree (dfa, NULL, NULL, END_OF_RE);
				2121	if (tree != NULL)
				2122	root = create_tree (dfa, tree, eor, CONCAT);
				2123	else
				2124	root = eor;
				2125	if (BE (eor == NULL \|\| root == NULL, 0))
				2126	{
				2127	*err = REG_ESPACE;
				2128	return NULL;
				2129	}
				2130	return root;
				2131	}
				2132
				2133	/* This function build the following tree, from regular expression
				2134	<branch1>\|<branch2>:
				2135	ALT
				2136	/ \
				2137	/ \
				2138	<branch1> <branch2>
				2139
				2140	ALT means alternative, which represents the operator `\|'. */
				2141
				2142	static bin_tree_t *
				2143	parse_reg_exp (re_string_t regexp, regex_t preg, re_token_t *token,
				2144	reg_syntax_t syntax, int nest, reg_errcode_t *err)
				2145	{
				2146	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
				2147	bin_tree_t tree, branch = NULL;
				2148	tree = parse_branch (regexp, preg, token, syntax, nest, err);
				2149	if (BE (*err != REG_NOERROR && tree == NULL, 0))
				2150	return NULL;
				2151
				2152	while (token->type == OP_ALT)
				2153	{
				2154	fetch_token (token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
				2155	if (token->type != OP_ALT && token->type != END_OF_RE
				2156	&& (nest == 0 \|\| token->type != OP_CLOSE_SUBEXP))
				2157	{
				2158	branch = parse_branch (regexp, preg, token, syntax, nest, err);
				2159	if (BE (*err != REG_NOERROR && branch == NULL, 0))
				2160	return NULL;
				2161	}
				2162	else
				2163	branch = NULL;
				2164	tree = create_tree (dfa, tree, branch, OP_ALT);
				2165	if (BE (tree == NULL, 0))
				2166	{
				2167	*err = REG_ESPACE;
				2168	return NULL;
				2169	}
				2170	}
				2171	return tree;
				2172	}
				2173
				2174	/* This function build the following tree, from regular expression
				2175	<exp1><exp2>:
				2176	CAT
				2177	/ \
				2178	/ \
				2179	<exp1> <exp2>
				2180
				2181	CAT means concatenation. */
				2182
				2183	static bin_tree_t *
				2184	parse_branch (re_string_t regexp, regex_t preg, re_token_t *token,
				2185	reg_syntax_t syntax, int nest, reg_errcode_t *err)
				2186	{
				2187	bin_tree_t tree, exp;
				2188	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
				2189	tree = parse_expression (regexp, preg, token, syntax, nest, err);
				2190	if (BE (*err != REG_NOERROR && tree == NULL, 0))
				2191	return NULL;
				2192
				2193	while (token->type != OP_ALT && token->type != END_OF_RE
				2194	&& (nest == 0 \|\| token->type != OP_CLOSE_SUBEXP))
				2195	{
				2196	exp = parse_expression (regexp, preg, token, syntax, nest, err);
				2197	if (BE (*err != REG_NOERROR && exp == NULL, 0))
				2198	{
				2199	return NULL;
				2200	}
				2201	if (tree != NULL && exp != NULL)
				2202	{
				2203	tree = create_tree (dfa, tree, exp, CONCAT);
				2204	if (tree == NULL)
				2205	{
				2206	*err = REG_ESPACE;
				2207	return NULL;
				2208	}
				2209	}
				2210	else if (tree == NULL)
				2211	tree = exp;
				2212	/* Otherwise exp == NULL, we don't need to create new tree. */
				2213	}
				2214	return tree;
				2215	}
				2216
				2217	/* This function build the following tree, from regular expression a*:
				2218	*
				2219	\|
				2220	a
				2221	*/
				2222
				2223	static bin_tree_t *
				2224	parse_expression (re_string_t regexp, regex_t preg, re_token_t *token,
				2225	reg_syntax_t syntax, int nest, reg_errcode_t *err)
				2226	{
				2227	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
				2228	bin_tree_t *tree;
				2229	switch (token->type)
				2230	{
				2231	case CHARACTER:
				2232	tree = create_token_tree (dfa, NULL, NULL, token);
				2233	if (BE (tree == NULL, 0))
				2234	{
				2235	*err = REG_ESPACE;
				2236	return NULL;
				2237	}
				2238	#ifdef RE_ENABLE_I18N
				2239	if (dfa->mb_cur_max > 1)
				2240	{
				2241	while (!re_string_eoi (regexp)
				2242	&& !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
				2243	{
				2244	bin_tree_t *mbc_remain;
				2245	fetch_token (token, regexp, syntax);
				2246	mbc_remain = create_token_tree (dfa, NULL, NULL, token);
				2247	tree = create_tree (dfa, tree, mbc_remain, CONCAT);
				2248	if (BE (mbc_remain == NULL \|\| tree == NULL, 0))
				2249	{
				2250	*err = REG_ESPACE;
				2251	return NULL;
				2252	}
				2253	}
				2254	}
				2255	#endif
				2256	break;
				2257	case OP_OPEN_SUBEXP:
				2258	tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
				2259	if (BE (*err != REG_NOERROR && tree == NULL, 0))
				2260	return NULL;
				2261	break;
				2262	case OP_OPEN_BRACKET:
				2263	tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
				2264	if (BE (*err != REG_NOERROR && tree == NULL, 0))
				2265	return NULL;
				2266	break;
				2267	case OP_BACK_REF:
				2268	if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
				2269	{
				2270	*err = REG_ESUBREG;
				2271	return NULL;
				2272	}
				2273	dfa->used_bkref_map \|= 1 << token->opr.idx;
				2274	tree = create_token_tree (dfa, NULL, NULL, token);
				2275	if (BE (tree == NULL, 0))
				2276	{
				2277	*err = REG_ESPACE;
				2278	return NULL;
				2279	}
				2280	++dfa->nbackref;
				2281	dfa->has_mb_node = 1;
				2282	break;
				2283	case OP_OPEN_DUP_NUM:
				2284	if (syntax & RE_CONTEXT_INVALID_DUP)
				2285	{
				2286	*err = REG_BADRPT;
				2287	return NULL;
				2288	}
				2289	/* FALLTHROUGH */
				2290	case OP_DUP_ASTERISK:
				2291	case OP_DUP_PLUS:
				2292	case OP_DUP_QUESTION:
				2293	if (syntax & RE_CONTEXT_INVALID_OPS)
				2294	{
				2295	*err = REG_BADRPT;
				2296	return NULL;
				2297	}
				2298	else if (syntax & RE_CONTEXT_INDEP_OPS)
				2299	{
				2300	fetch_token (token, regexp, syntax);
				2301	return parse_expression (regexp, preg, token, syntax, nest, err);
				2302	}
				2303	/* else fall through */
				2304	case OP_CLOSE_SUBEXP:
				2305	if ((token->type == OP_CLOSE_SUBEXP) &&
				2306	!(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
				2307	{
				2308	*err = REG_ERPAREN;
				2309	return NULL;
				2310	}
				2311	/* else fall through */
				2312	case OP_CLOSE_DUP_NUM:
				2313	/* We treat it as a normal character. */
				2314
				2315	/* Then we can these characters as normal characters. */
				2316	token->type = CHARACTER;
				2317	/* mb_partial and word_char bits should be initialized already
				2318	by peek_token. */
				2319	tree = create_token_tree (dfa, NULL, NULL, token);
				2320	if (BE (tree == NULL, 0))
				2321	{
				2322	*err = REG_ESPACE;
				2323	return NULL;
				2324	}
				2325	break;
				2326	case ANCHOR:
				2327	if ((token->opr.ctx_type
				2328	& (WORD_DELIM \| NOT_WORD_DELIM \| WORD_FIRST \| WORD_LAST))
				2329	&& dfa->word_ops_used == 0)
				2330	init_word_char (dfa);
				2331	if (token->opr.ctx_type == WORD_DELIM
				2332	\|\| token->opr.ctx_type == NOT_WORD_DELIM)
				2333	{
				2334	bin_tree_t tree_first, tree_last;
				2335	if (token->opr.ctx_type == WORD_DELIM)
				2336	{
				2337	token->opr.ctx_type = WORD_FIRST;
				2338	tree_first = create_token_tree (dfa, NULL, NULL, token);
				2339	token->opr.ctx_type = WORD_LAST;
				2340	}
				2341	else
				2342	{
				2343	token->opr.ctx_type = INSIDE_WORD;
				2344	tree_first = create_token_tree (dfa, NULL, NULL, token);
				2345	token->opr.ctx_type = INSIDE_NOTWORD;
				2346	}
				2347	tree_last = create_token_tree (dfa, NULL, NULL, token);
				2348	tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
				2349	if (BE (tree_first == NULL \|\| tree_last == NULL \|\| tree == NULL, 0))
				2350	{
				2351	*err = REG_ESPACE;
				2352	return NULL;
				2353	}
				2354	}
				2355	else
				2356	{
				2357	tree = create_token_tree (dfa, NULL, NULL, token);
				2358	if (BE (tree == NULL, 0))
				2359	{
				2360	*err = REG_ESPACE;
				2361	return NULL;
				2362	}
				2363	}
				2364	/* We must return here, since ANCHORs can't be followed
				2365	by repetition operators.
				2366	eg. RE"^" is invalid or "<ANCHOR(^)><CHAR()>",
				2367	it must not be "<ANCHOR(^)><REPEAT()>". /
				2368	fetch_token (token, regexp, syntax);
				2369	return tree;
				2370	case OP_PERIOD:
				2371	tree = create_token_tree (dfa, NULL, NULL, token);
				2372	if (BE (tree == NULL, 0))
				2373	{
				2374	*err = REG_ESPACE;
				2375	return NULL;
				2376	}
				2377	if (dfa->mb_cur_max > 1)
				2378	dfa->has_mb_node = 1;
				2379	break;
				2380	case OP_WORD:
				2381	case OP_NOTWORD:
				2382	tree = build_charclass_op (dfa, regexp->trans,
				2383	"alnum",
				2384	"_",
				2385	token->type == OP_NOTWORD, err);
				2386	if (BE (*err != REG_NOERROR && tree == NULL, 0))
				2387	return NULL;
				2388	break;
				2389	case OP_SPACE:
				2390	case OP_NOTSPACE:
				2391	tree = build_charclass_op (dfa, regexp->trans,
				2392	"space",
				2393	"",
				2394	token->type == OP_NOTSPACE, err);
				2395	if (BE (*err != REG_NOERROR && tree == NULL, 0))
				2396	return NULL;
				2397	break;
				2398	case OP_ALT:
				2399	case END_OF_RE:
				2400	return NULL;
				2401	case BACK_SLASH:
				2402	*err = REG_EESCAPE;
				2403	return NULL;
				2404	default:
				2405	/* Must not happen? */
				2406	#ifdef DEBUG
				2407	assert (0);
				2408	#endif
				2409	return NULL;
				2410	}
				2411	fetch_token (token, regexp, syntax);
				2412
				2413	while (token->type == OP_DUP_ASTERISK \|\| token->type == OP_DUP_PLUS
				2414	\|\| token->type == OP_DUP_QUESTION \|\| token->type == OP_OPEN_DUP_NUM)
				2415	{
				2416	tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
				2417	if (BE (*err != REG_NOERROR && tree == NULL, 0))
				2418	return NULL;
				2419	/* In BRE consecutive duplications are not allowed. */
				2420	if ((syntax & RE_CONTEXT_INVALID_DUP)
				2421	&& (token->type == OP_DUP_ASTERISK
				2422	\|\| token->type == OP_OPEN_DUP_NUM))
				2423	{
				2424	*err = REG_BADRPT;
				2425	return NULL;
				2426	}
				2427	}
				2428
				2429	return tree;
				2430	}
				2431
				2432	/* This function build the following tree, from regular expression
				2433	(<reg_exp>):
				2434	SUBEXP
				2435	\|
				2436	<reg_exp>
				2437	*/
				2438
				2439	static bin_tree_t *
				2440	parse_sub_exp (re_string_t regexp, regex_t preg, re_token_t *token,
				2441	reg_syntax_t syntax, int nest, reg_errcode_t *err)
				2442	{
				2443	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
				2444	bin_tree_t *tree;
				2445	size_t cur_nsub;
				2446	cur_nsub = preg->re_nsub++;
				2447
				2448	fetch_token (token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
				2449
				2450	/* The subexpression may be a null string. */
				2451	if (token->type == OP_CLOSE_SUBEXP)
				2452	tree = NULL;
				2453	else
				2454	{
				2455	tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
				2456	if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
				2457	*err = REG_EPAREN;
				2458	if (BE (*err != REG_NOERROR, 0))
				2459	return NULL;
				2460	}
				2461
				2462	if (cur_nsub <= '9' - '1')
				2463	dfa->completed_bkref_map \|= 1 << cur_nsub;
				2464
				2465	tree = create_tree (dfa, tree, NULL, SUBEXP);
				2466	if (BE (tree == NULL, 0))
				2467	{
				2468	*err = REG_ESPACE;
				2469	return NULL;
				2470	}
				2471	tree->token.opr.idx = cur_nsub;
				2472	return tree;
				2473	}
				2474
				2475	/* This function parse repetition operators like "", "+", "{1,3}" etc. /
				2476
				2477	static bin_tree_t *
				2478	parse_dup_op (bin_tree_t elem, re_string_t regexp, re_dfa_t *dfa,
				2479	re_token_t token, reg_syntax_t syntax, reg_errcode_t err)
				2480	{
				2481	bin_tree_t tree = NULL, old_tree = NULL;
				2482	int i, start, end, start_idx = re_string_cur_idx (regexp);
				2483	#ifndef RE_TOKEN_INIT_BUG
				2484	re_token_t start_token = *token;
				2485	#else
				2486	re_token_t start_token;
				2487
				2488	memcpy ((void ) &start_token, (void ) token, sizeof start_token);
				2489	#endif
				2490
				2491	if (token->type == OP_OPEN_DUP_NUM)
				2492	{
				2493	end = 0;
				2494	start = fetch_number (regexp, token, syntax);
				2495	if (start == -1)
				2496	{
				2497	if (token->type == CHARACTER && token->opr.c == ',')
				2498	start = 0; /* We treat "{,m}" as "{0,m}". */
				2499	else
				2500	{
				2501	err = REG_BADBR; / <re>{} is invalid. */
				2502	return NULL;
				2503	}
				2504	}
				2505	if (BE (start != -2, 1))
				2506	{
				2507	/* We treat "{n}" as "{n,n}". */
				2508	end = ((token->type == OP_CLOSE_DUP_NUM) ? start
				2509	: ((token->type == CHARACTER && token->opr.c == ',')
				2510	? fetch_number (regexp, token, syntax) : -2));
				2511	}
				2512	if (BE (start == -2 \|\| end == -2, 0))
				2513	{
				2514	/* Invalid sequence. */
				2515	if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
				2516	{
				2517	if (token->type == END_OF_RE)
				2518	*err = REG_EBRACE;
				2519	else
				2520	*err = REG_BADBR;
				2521
				2522	return NULL;
				2523	}
				2524
				2525	/* If the syntax bit is set, rollback. */
				2526	re_string_set_index (regexp, start_idx);
				2527	*token = start_token;
				2528	token->type = CHARACTER;
				2529	/* mb_partial and word_char bits should be already initialized by
				2530	peek_token. */
				2531	return elem;
				2532	}
				2533
				2534	if (BE ((end != -1 && start > end) \|\| token->type != OP_CLOSE_DUP_NUM, 0))
				2535	{
				2536	/* First number greater than second. */
				2537	*err = REG_BADBR;
				2538	return NULL;
				2539	}
				2540	}
				2541	else
				2542	{
				2543	start = (token->type == OP_DUP_PLUS) ? 1 : 0;
				2544	end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
				2545	}
				2546
				2547	fetch_token (token, regexp, syntax);
				2548
				2549	if (BE (elem == NULL, 0))
				2550	return NULL;
				2551	if (BE (start == 0 && end == 0, 0))
				2552	{
				2553	postorder (elem, free_tree, NULL);
				2554	return NULL;
				2555	}
				2556
				2557	/* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
				2558	if (BE (start > 0, 0))
				2559	{
				2560	tree = elem;
				2561	for (i = 2; i <= start; ++i)
				2562	{
				2563	elem = duplicate_tree (elem, dfa);
				2564	tree = create_tree (dfa, tree, elem, CONCAT);
				2565	if (BE (elem == NULL \|\| tree == NULL, 0))
				2566	goto parse_dup_op_espace;
				2567	}
				2568
				2569	if (start == end)
				2570	return tree;
				2571
				2572	/* Duplicate ELEM before it is marked optional. */
				2573	elem = duplicate_tree (elem, dfa);
				2574	old_tree = tree;
				2575	}
				2576	else
				2577	old_tree = NULL;
				2578
				2579	if (elem->token.type == SUBEXP)
				2580	postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
				2581
				2582	tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
				2583	if (BE (tree == NULL, 0))
				2584	goto parse_dup_op_espace;
				2585
				2586	/* This loop is actually executed only when end != -1,
				2587	to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
				2588	already created the start+1-th copy. */
				2589	for (i = start + 2; i <= end; ++i)
				2590	{
				2591	elem = duplicate_tree (elem, dfa);
				2592	tree = create_tree (dfa, tree, elem, CONCAT);
				2593	if (BE (elem == NULL \|\| tree == NULL, 0))
				2594	goto parse_dup_op_espace;
				2595
				2596	tree = create_tree (dfa, tree, NULL, OP_ALT);
				2597	if (BE (tree == NULL, 0))
				2598	goto parse_dup_op_espace;
				2599	}
				2600
				2601	if (old_tree)
				2602	tree = create_tree (dfa, old_tree, tree, CONCAT);
				2603
				2604	return tree;
				2605
				2606	parse_dup_op_espace:
				2607	*err = REG_ESPACE;
				2608	return NULL;
				2609	}
				2610
				2611	/* Size of the names for collating symbol/equivalence_class/character_class.
				2612	I'm not sure, but maybe enough. */
				2613	#define BRACKET_NAME_BUF_SIZE 32
				2614
				2615	#ifndef _LIBC
				2616	/* Local function for parse_bracket_exp only used in case of NOT _LIBC.
				2617	Build the range expression which starts from START_ELEM, and ends
				2618	at END_ELEM. The result are written to MBCSET and SBCSET.
				2619	RANGE_ALLOC is the allocated size of mbcset->range_starts, and
				2620	mbcset->range_ends, is a pointer argument sinse we may
				2621	update it. */
				2622
				2623	static reg_errcode_t
				2624	internal_function
				2625	# ifdef RE_ENABLE_I18N
				2626	build_range_exp (bitset_t sbcset, re_charset_t mbcset, int range_alloc,
				2627	bracket_elem_t start_elem, bracket_elem_t end_elem)
				2628	# else /* not RE_ENABLE_I18N */
				2629	build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
				2630	bracket_elem_t *end_elem)
				2631	# endif /* not RE_ENABLE_I18N */
				2632	{
				2633	unsigned int start_ch, end_ch;
				2634	/* Equivalence Classes and Character Classes can't be a range start/end. */
				2635	if (BE (start_elem->type == EQUIV_CLASS \|\| start_elem->type == CHAR_CLASS
				2636	\|\| end_elem->type == EQUIV_CLASS \|\| end_elem->type == CHAR_CLASS,
				2637	0))
				2638	return REG_ERANGE;
				2639
				2640	/* We can handle no multi character collating elements without libc
				2641	support. */
				2642	if (BE ((start_elem->type == COLL_SYM
				2643	&& strlen ((char *) start_elem->opr.name) > 1)
				2644	\|\| (end_elem->type == COLL_SYM
				2645	&& strlen ((char *) end_elem->opr.name) > 1), 0))
				2646	return REG_ECOLLATE;
				2647
				2648	# ifdef RE_ENABLE_I18N
				2649	{
				2650	wchar_t wc;
				2651	wint_t start_wc;
				2652	wint_t end_wc;
				2653	wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
				2654
				2655	start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
				2656	: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
				2657	: 0));
				2658	end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
				2659	: ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
				2660	: 0));
				2661	#ifdef GAWK
				2662	/*
				2663	* Fedora Core 2, maybe others, have broken `btowc' that returns -1
				2664	* for any value > 127. Sigh. Note that `start_ch' and `end_ch' are
				2665	* unsigned, so we don't have sign extension problems.
				2666	*/
				2667	start_wc = ((start_elem->type == SB_CHAR \|\| start_elem->type == COLL_SYM)
				2668	? start_ch : start_elem->opr.wch);
				2669	end_wc = ((end_elem->type == SB_CHAR \|\| end_elem->type == COLL_SYM)
				2670	? end_ch : end_elem->opr.wch);
				2671	#else
				2672	start_wc = ((start_elem->type == SB_CHAR \|\| start_elem->type == COLL_SYM)
				2673	? __btowc (start_ch) : start_elem->opr.wch);
				2674	end_wc = ((end_elem->type == SB_CHAR \|\| end_elem->type == COLL_SYM)
				2675	? __btowc (end_ch) : end_elem->opr.wch);
				2676	#endif
				2677	if (start_wc == WEOF \|\| end_wc == WEOF)
				2678	return REG_ECOLLATE;
				2679	cmp_buf[0] = start_wc;
				2680	cmp_buf[4] = end_wc;
				2681	if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
				2682	return REG_ERANGE;
				2683
				2684	/* Got valid collation sequence values, add them as a new entry.
				2685	However, for !_LIBC we have no collation elements: if the
				2686	character set is single byte, the single byte character set
				2687	that we build below suffices. parse_bracket_exp passes
				2688	no MBCSET if dfa->mb_cur_max == 1. */
				2689	if (mbcset)
				2690	{
				2691	/* Check the space of the arrays. */
				2692	if (BE (*range_alloc == mbcset->nranges, 0))
				2693	{
				2694	/* There is not enough space, need realloc. */
				2695	wchar_t new_array_start, new_array_end;
				2696	int new_nranges;
				2697
				2698	/* +1 in case of mbcset->nranges is 0. */
				2699	new_nranges = 2 * mbcset->nranges + 1;
				2700	/* Use realloc since mbcset->range_starts and mbcset->range_ends
				2701	are NULL if range_alloc == 0. /
				2702	new_array_start = re_realloc (mbcset->range_starts, wchar_t,
				2703	new_nranges);
				2704	new_array_end = re_realloc (mbcset->range_ends, wchar_t,
				2705	new_nranges);
				2706
				2707	if (BE (new_array_start == NULL \|\| new_array_end == NULL, 0))
				2708	return REG_ESPACE;
				2709
				2710	mbcset->range_starts = new_array_start;
				2711	mbcset->range_ends = new_array_end;
				2712	*range_alloc = new_nranges;
				2713	}
				2714
				2715	mbcset->range_starts[mbcset->nranges] = start_wc;
				2716	mbcset->range_ends[mbcset->nranges++] = end_wc;
				2717	}
				2718
				2719	/* Build the table for single byte characters. */
				2720	for (wc = 0; wc < SBC_MAX; ++wc)
				2721	{
				2722	cmp_buf[2] = wc;
				2723	if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
				2724	&& wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
				2725	bitset_set (sbcset, wc);
				2726	}
				2727	}
				2728	# else /* not RE_ENABLE_I18N */
				2729	{
				2730	unsigned int ch;
				2731	start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
				2732	: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
				2733	: 0));
				2734	end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
				2735	: ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
				2736	: 0));
				2737	if (start_ch > end_ch)
				2738	return REG_ERANGE;
				2739	/* Build the table for single byte characters. */
				2740	for (ch = 0; ch < SBC_MAX; ++ch)
				2741	if (start_ch <= ch && ch <= end_ch)
				2742	bitset_set (sbcset, ch);
				2743	}
				2744	# endif /* not RE_ENABLE_I18N */
				2745	return REG_NOERROR;
				2746	}
				2747	#endif /* not _LIBC */
				2748
				2749	#ifndef _LIBC
				2750	/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
				2751	Build the collating element which is represented by NAME.
				2752	The result are written to MBCSET and SBCSET.
				2753	COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
				2754	pointer argument since we may update it. */
				2755
				2756	static reg_errcode_t
				2757	internal_function
				2758	# ifdef RE_ENABLE_I18N
				2759	build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
				2760	int coll_sym_alloc, const unsigned char name)
				2761	# else /* not RE_ENABLE_I18N */
				2762	build_collating_symbol (bitset_t sbcset, const unsigned char *name)
				2763	# endif /* not RE_ENABLE_I18N */
				2764	{
				2765	size_t name_len = strlen ((const char *) name);
				2766	if (BE (name_len != 1, 0))
				2767	return REG_ECOLLATE;
				2768	else
				2769	{
				2770	bitset_set (sbcset, name[0]);
				2771	return REG_NOERROR;
				2772	}
				2773	}
				2774	#endif /* not _LIBC */
				2775
				2776	/* This function parse bracket expression like "[abc]", "[a-c]",
				2777	"[[.a-a.]]" etc. */
				2778
				2779	static bin_tree_t *
				2780	parse_bracket_exp (re_string_t regexp, re_dfa_t dfa, re_token_t *token,
				2781	reg_syntax_t syntax, reg_errcode_t *err)
				2782	{
				2783	#ifdef _LIBC
				2784	const unsigned char *collseqmb;
				2785	const char *collseqwc;
				2786	uint32_t nrules;
				2787	int32_t table_size;
				2788	const int32_t *symb_table;
				2789	const unsigned char *extra;
				2790
				2791	/* Local function for parse_bracket_exp used in _LIBC environement.
				2792	Seek the collating symbol entry correspondings to NAME.
				2793	Return the index of the symbol in the SYMB_TABLE. */
				2794
				2795	auto inline int32_t
				2796	__attribute ((always_inline))
				2797	seek_collating_symbol_entry (name, name_len)
				2798	const unsigned char *name;
				2799	size_t name_len;
				2800	{
				2801	int32_t hash = elem_hash ((const char *) name, name_len);
				2802	int32_t elem = hash % table_size;
				2803	if (symb_table[2 * elem] != 0)
				2804	{
				2805	int32_t second = hash % (table_size - 2) + 1;
				2806
				2807	do
				2808	{
				2809	/* First compare the hashing value. */
				2810	if (symb_table[2 * elem] == hash
				2811	/* Compare the length of the name. */
				2812	&& name_len == extra[symb_table[2 * elem + 1]]
				2813	/* Compare the name. */
				2814	&& memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
				2815	name_len) == 0)
				2816	{
				2817	/* Yep, this is the entry. */
				2818	break;
				2819	}
				2820
				2821	/* Next entry. */
				2822	elem += second;
				2823	}
				2824	while (symb_table[2 * elem] != 0);
				2825	}
				2826	return elem;
				2827	}
				2828
				2829	/* Local function for parse_bracket_exp used in _LIBC environment.
				2830	Look up the collation sequence value of BR_ELEM.
				2831	Return the value if succeeded, UINT_MAX otherwise. */
				2832
				2833	auto inline unsigned int
				2834	__attribute ((always_inline))
				2835	lookup_collation_sequence_value (br_elem)
				2836	bracket_elem_t *br_elem;
				2837	{
				2838	if (br_elem->type == SB_CHAR)
				2839	{
				2840	/*
				2841	if (MB_CUR_MAX == 1)
				2842	*/
				2843	if (nrules == 0)
				2844	return collseqmb[br_elem->opr.ch];
				2845	else
				2846	{
				2847	wint_t wc = __btowc (br_elem->opr.ch);
				2848	return __collseq_table_lookup (collseqwc, wc);
				2849	}
				2850	}
				2851	else if (br_elem->type == MB_CHAR)
				2852	{
				2853	if (nrules != 0)
				2854	return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
				2855	}
				2856	else if (br_elem->type == COLL_SYM)
				2857	{
				2858	size_t sym_name_len = strlen ((char *) br_elem->opr.name);
				2859	if (nrules != 0)
				2860	{
				2861	int32_t elem, idx;
				2862	elem = seek_collating_symbol_entry (br_elem->opr.name,
				2863	sym_name_len);
				2864	if (symb_table[2 * elem] != 0)
				2865	{
				2866	/* We found the entry. */
				2867	idx = symb_table[2 * elem + 1];
				2868	/* Skip the name of collating element name. */
				2869	idx += 1 + extra[idx];
				2870	/* Skip the byte sequence of the collating element. */
				2871	idx += 1 + extra[idx];
				2872	/* Adjust for the alignment. */
				2873	idx = (idx + 3) & ~3;
				2874	/* Skip the multibyte collation sequence value. */
				2875	idx += sizeof (unsigned int);
				2876	/* Skip the wide char sequence of the collating element. */
				2877	idx += sizeof (unsigned int) *
				2878	(1 + (unsigned int ) (extra + idx));
				2879	/* Return the collation sequence value. */
				2880	return (unsigned int ) (extra + idx);
				2881	}
				2882	else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
				2883	{
				2884	/* No valid character. Match it as a single byte
				2885	character. */
				2886	return collseqmb[br_elem->opr.name[0]];
				2887	}
				2888	}
				2889	else if (sym_name_len == 1)
				2890	return collseqmb[br_elem->opr.name[0]];
				2891	}
				2892	return UINT_MAX;
				2893	}
				2894
				2895	/* Local function for parse_bracket_exp used in _LIBC environement.
				2896	Build the range expression which starts from START_ELEM, and ends
				2897	at END_ELEM. The result are written to MBCSET and SBCSET.
				2898	RANGE_ALLOC is the allocated size of mbcset->range_starts, and
				2899	mbcset->range_ends, is a pointer argument sinse we may
				2900	update it. */
				2901
				2902	auto inline reg_errcode_t
				2903	__attribute ((always_inline))
				2904	build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
				2905	re_charset_t *mbcset;
				2906	int *range_alloc;
				2907	bitset_t sbcset;
				2908	bracket_elem_t start_elem, end_elem;
				2909	{
				2910	unsigned int ch;
				2911	uint32_t start_collseq;
				2912	uint32_t end_collseq;
				2913
				2914	/* Equivalence Classes and Character Classes can't be a range
				2915	start/end. */
				2916	if (BE (start_elem->type == EQUIV_CLASS \|\| start_elem->type == CHAR_CLASS
				2917	\|\| end_elem->type == EQUIV_CLASS \|\| end_elem->type == CHAR_CLASS,
				2918	0))
				2919	return REG_ERANGE;
				2920
				2921	start_collseq = lookup_collation_sequence_value (start_elem);
				2922	end_collseq = lookup_collation_sequence_value (end_elem);
				2923	/* Check start/end collation sequence values. */
				2924	if (BE (start_collseq == UINT_MAX \|\| end_collseq == UINT_MAX, 0))
				2925	return REG_ECOLLATE;
				2926	if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
				2927	return REG_ERANGE;
				2928
				2929	/* Got valid collation sequence values, add them as a new entry.
				2930	However, if we have no collation elements, and the character set
				2931	is single byte, the single byte character set that we
				2932	build below suffices. */
				2933	if (nrules > 0 \|\| dfa->mb_cur_max > 1)
				2934	{
				2935	/* Check the space of the arrays. */
				2936	if (BE (*range_alloc == mbcset->nranges, 0))
				2937	{
				2938	/* There is not enough space, need realloc. */
				2939	uint32_t *new_array_start;
				2940	uint32_t *new_array_end;
				2941	int new_nranges;
				2942
				2943	/* +1 in case of mbcset->nranges is 0. */
				2944	new_nranges = 2 * mbcset->nranges + 1;
				2945	new_array_start = re_realloc (mbcset->range_starts, uint32_t,
				2946	new_nranges);
				2947	new_array_end = re_realloc (mbcset->range_ends, uint32_t,
				2948	new_nranges);
				2949
				2950	if (BE (new_array_start == NULL \|\| new_array_end == NULL, 0))
				2951	return REG_ESPACE;
				2952
				2953	mbcset->range_starts = new_array_start;
				2954	mbcset->range_ends = new_array_end;
				2955	*range_alloc = new_nranges;
				2956	}
				2957
				2958	mbcset->range_starts[mbcset->nranges] = start_collseq;
				2959	mbcset->range_ends[mbcset->nranges++] = end_collseq;
				2960	}
				2961
				2962	/* Build the table for single byte characters. */
				2963	for (ch = 0; ch < SBC_MAX; ch++)
				2964	{
				2965	uint32_t ch_collseq;
				2966	/*
				2967	if (MB_CUR_MAX == 1)
				2968	*/
				2969	if (nrules == 0)
				2970	ch_collseq = collseqmb[ch];
				2971	else
				2972	ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
				2973	if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
				2974	bitset_set (sbcset, ch);
				2975	}
				2976	return REG_NOERROR;
				2977	}
				2978
				2979	/* Local function for parse_bracket_exp used in _LIBC environement.
				2980	Build the collating element which is represented by NAME.
				2981	The result are written to MBCSET and SBCSET.
				2982	COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
				2983	pointer argument sinse we may update it. */
				2984
				2985	auto inline reg_errcode_t
				2986	__attribute ((always_inline))
				2987	build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
				2988	re_charset_t *mbcset;
				2989	int *coll_sym_alloc;
				2990	bitset_t sbcset;
				2991	const unsigned char *name;
				2992	{
				2993	int32_t elem, idx;
				2994	size_t name_len = strlen ((const char *) name);
				2995	if (nrules != 0)
				2996	{
				2997	elem = seek_collating_symbol_entry (name, name_len);
				2998	if (symb_table[2 * elem] != 0)
				2999	{
				3000	/* We found the entry. */
				3001	idx = symb_table[2 * elem + 1];
				3002	/* Skip the name of collating element name. */
				3003	idx += 1 + extra[idx];
				3004	}
				3005	else if (symb_table[2 * elem] == 0 && name_len == 1)
				3006	{
				3007	/* No valid character, treat it as a normal
				3008	character. */
				3009	bitset_set (sbcset, name[0]);
				3010	return REG_NOERROR;
				3011	}
				3012	else
				3013	return REG_ECOLLATE;
				3014
				3015	/* Got valid collation sequence, add it as a new entry. */
				3016	/* Check the space of the arrays. */
				3017	if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
				3018	{
				3019	/* Not enough, realloc it. */
				3020	/* +1 in case of mbcset->ncoll_syms is 0. */
				3021	int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
				3022	/* Use realloc since mbcset->coll_syms is NULL
				3023	if alloc == 0. /
				3024	int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
				3025	new_coll_sym_alloc);
				3026	if (BE (new_coll_syms == NULL, 0))
				3027	return REG_ESPACE;
				3028	mbcset->coll_syms = new_coll_syms;
				3029	*coll_sym_alloc = new_coll_sym_alloc;
				3030	}
				3031	mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
				3032	return REG_NOERROR;
				3033	}
				3034	else
				3035	{
				3036	if (BE (name_len != 1, 0))
				3037	return REG_ECOLLATE;
				3038	else
				3039	{
				3040	bitset_set (sbcset, name[0]);
				3041	return REG_NOERROR;
				3042	}
				3043	}
				3044	}
				3045	#endif
				3046
				3047	re_token_t br_token;
				3048	re_bitset_ptr_t sbcset;
				3049	#ifdef RE_ENABLE_I18N
				3050	re_charset_t *mbcset;
				3051	int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
				3052	int equiv_class_alloc = 0, char_class_alloc = 0;
				3053	#endif /* not RE_ENABLE_I18N */
				3054	int non_match = 0;
				3055	bin_tree_t *work_tree;
				3056	int token_len;
				3057	int first_round = 1;
				3058	#ifdef _LIBC
				3059	collseqmb = (const unsigned char *)
				3060	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
				3061	nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
				3062	if (nrules)
				3063	{
				3064	/*
				3065	if (MB_CUR_MAX > 1)
				3066	*/
				3067	collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
				3068	table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
				3069	symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
				3070	_NL_COLLATE_SYMB_TABLEMB);
				3071	extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
				3072	_NL_COLLATE_SYMB_EXTRAMB);
				3073	}
				3074	#endif
				3075	sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
				3076	#ifdef RE_ENABLE_I18N
				3077	mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
				3078	#endif /* RE_ENABLE_I18N */
				3079	#ifdef RE_ENABLE_I18N
				3080	if (BE (sbcset == NULL \|\| mbcset == NULL, 0))
				3081	#else
				3082	if (BE (sbcset == NULL, 0))
				3083	#endif /* RE_ENABLE_I18N */
				3084	{
				3085	*err = REG_ESPACE;
				3086	return NULL;
				3087	}
				3088
				3089	token_len = peek_token_bracket (token, regexp, syntax);
				3090	if (BE (token->type == END_OF_RE, 0))
				3091	{
				3092	*err = REG_BADPAT;
				3093	goto parse_bracket_exp_free_return;
				3094	}
				3095	if (token->type == OP_NON_MATCH_LIST)
				3096	{
				3097	#ifdef RE_ENABLE_I18N
				3098	mbcset->non_match = 1;
				3099	#endif /* not RE_ENABLE_I18N */
				3100	non_match = 1;
				3101	if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
				3102	bitset_set (sbcset, '\n');
				3103	re_string_skip_bytes (regexp, token_len); /* Skip a token. */
				3104	token_len = peek_token_bracket (token, regexp, syntax);
				3105	if (BE (token->type == END_OF_RE, 0))
				3106	{
				3107	*err = REG_BADPAT;
				3108	goto parse_bracket_exp_free_return;
				3109	}
				3110	}
				3111
				3112	/* We treat the first ']' as a normal character. */
				3113	if (token->type == OP_CLOSE_BRACKET)
				3114	token->type = CHARACTER;
				3115
				3116	while (1)
				3117	{
				3118	bracket_elem_t start_elem, end_elem;
				3119	unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
				3120	unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
				3121	reg_errcode_t ret;
				3122	int token_len2 = 0, is_range_exp = 0;
				3123	re_token_t token2;
				3124
				3125	start_elem.opr.name = start_name_buf;
				3126	ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
				3127	syntax, first_round);
				3128	if (BE (ret != REG_NOERROR, 0))
				3129	{
				3130	*err = ret;
				3131	goto parse_bracket_exp_free_return;
				3132	}
				3133	first_round = 0;
				3134
				3135	/* Get information about the next token. We need it in any case. */
				3136	token_len = peek_token_bracket (token, regexp, syntax);
				3137
				3138	/* Do not check for ranges if we know they are not allowed. */
				3139	if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
				3140	{
				3141	if (BE (token->type == END_OF_RE, 0))
				3142	{
				3143	*err = REG_EBRACK;
				3144	goto parse_bracket_exp_free_return;
				3145	}
				3146	if (token->type == OP_CHARSET_RANGE)
				3147	{
				3148	re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
				3149	token_len2 = peek_token_bracket (&token2, regexp, syntax);
				3150	if (BE (token2.type == END_OF_RE, 0))
				3151	{
				3152	*err = REG_EBRACK;
				3153	goto parse_bracket_exp_free_return;
				3154	}
				3155	if (token2.type == OP_CLOSE_BRACKET)
				3156	{
				3157	/* We treat the last '-' as a normal character. */
				3158	re_string_skip_bytes (regexp, -token_len);
				3159	token->type = CHARACTER;
				3160	}
				3161	else
				3162	is_range_exp = 1;
				3163	}
				3164	}
				3165
				3166	if (is_range_exp == 1)
				3167	{
				3168	end_elem.opr.name = end_name_buf;
				3169	ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
				3170	dfa, syntax, 1);
				3171	if (BE (ret != REG_NOERROR, 0))
				3172	{
				3173	*err = ret;
				3174	goto parse_bracket_exp_free_return;
				3175	}
				3176
				3177	token_len = peek_token_bracket (token, regexp, syntax);
				3178
				3179	#ifdef _LIBC
				3180	*err = build_range_exp (sbcset, mbcset, &range_alloc,
				3181	&start_elem, &end_elem);
				3182	#else
				3183	# ifdef RE_ENABLE_I18N
				3184	*err = build_range_exp (sbcset,
				3185	dfa->mb_cur_max > 1 ? mbcset : NULL,
				3186	&range_alloc, &start_elem, &end_elem);
				3187	# else
				3188	*err = build_range_exp (sbcset, &start_elem, &end_elem);
				3189	# endif
				3190	#endif /* RE_ENABLE_I18N */
				3191	if (BE (*err != REG_NOERROR, 0))
				3192	goto parse_bracket_exp_free_return;
				3193	}
				3194	else
				3195	{
				3196	switch (start_elem.type)
				3197	{
				3198	case SB_CHAR:
				3199	bitset_set (sbcset, start_elem.opr.ch);
				3200	break;
				3201	#ifdef RE_ENABLE_I18N
				3202	case MB_CHAR:
				3203	/* Check whether the array has enough space. */
				3204	if (BE (mbchar_alloc == mbcset->nmbchars, 0))
				3205	{
				3206	wchar_t *new_mbchars;
				3207	/* Not enough, realloc it. */
				3208	/* +1 in case of mbcset->nmbchars is 0. */
				3209	mbchar_alloc = 2 * mbcset->nmbchars + 1;
				3210	/* Use realloc since array is NULL if alloc == 0. /
				3211	new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
				3212	mbchar_alloc);
				3213	if (BE (new_mbchars == NULL, 0))
				3214	goto parse_bracket_exp_espace;
				3215	mbcset->mbchars = new_mbchars;
				3216	}
				3217	mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
				3218	break;
				3219	#endif /* RE_ENABLE_I18N */
				3220	case EQUIV_CLASS:
				3221	*err = build_equiv_class (sbcset,
				3222	#ifdef RE_ENABLE_I18N
				3223	mbcset, &equiv_class_alloc,
				3224	#endif /* RE_ENABLE_I18N */
				3225	start_elem.opr.name);
				3226	if (BE (*err != REG_NOERROR, 0))
				3227	goto parse_bracket_exp_free_return;
				3228	break;
				3229	case COLL_SYM:
				3230	*err = build_collating_symbol (sbcset,
				3231	#ifdef RE_ENABLE_I18N
				3232	mbcset, &coll_sym_alloc,
				3233	#endif /* RE_ENABLE_I18N */
				3234	start_elem.opr.name);
				3235	if (BE (*err != REG_NOERROR, 0))
				3236	goto parse_bracket_exp_free_return;
				3237	break;
				3238	case CHAR_CLASS:
				3239	*err = build_charclass (regexp->trans, sbcset,
				3240	#ifdef RE_ENABLE_I18N
				3241	mbcset, &char_class_alloc,
				3242	#endif /* RE_ENABLE_I18N */
				3243	(const char *) start_elem.opr.name, syntax);
				3244	if (BE (*err != REG_NOERROR, 0))
				3245	goto parse_bracket_exp_free_return;
				3246	break;
				3247	default:
				3248	assert (0);
				3249	break;
				3250	}
				3251	}
				3252	if (BE (token->type == END_OF_RE, 0))
				3253	{
				3254	*err = REG_EBRACK;
				3255	goto parse_bracket_exp_free_return;
				3256	}
				3257	if (token->type == OP_CLOSE_BRACKET)
				3258	break;
				3259	}
				3260
				3261	re_string_skip_bytes (regexp, token_len); /* Skip a token. */
				3262
				3263	/* If it is non-matching list. */
				3264	if (non_match)
				3265	bitset_not (sbcset);
				3266
				3267	#ifdef RE_ENABLE_I18N
				3268	/* Ensure only single byte characters are set. */
				3269	if (dfa->mb_cur_max > 1)
				3270	bitset_mask (sbcset, dfa->sb_char);
				3271
				3272	if (mbcset->nmbchars \|\| mbcset->ncoll_syms \|\| mbcset->nequiv_classes
				3273	\|\| mbcset->nranges \|\| (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
				3274	\|\| mbcset->non_match)))
				3275	{
				3276	bin_tree_t *mbc_tree;
				3277	int sbc_idx;
				3278	/* Build a tree for complex bracket. */
				3279	dfa->has_mb_node = 1;
				3280	br_token.type = COMPLEX_BRACKET;
				3281	br_token.opr.mbcset = mbcset;
				3282	mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
				3283	if (BE (mbc_tree == NULL, 0))
				3284	goto parse_bracket_exp_espace;
				3285	for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
				3286	if (sbcset[sbc_idx])
				3287	break;
				3288	/* If there are no bits set in sbcset, there is no point
				3289	of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
				3290	if (sbc_idx < BITSET_WORDS)
				3291	{
				3292	/* Build a tree for simple bracket. */
				3293	br_token.type = SIMPLE_BRACKET;
				3294	br_token.opr.sbcset = sbcset;
				3295	work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
				3296	if (BE (work_tree == NULL, 0))
				3297	goto parse_bracket_exp_espace;
				3298
				3299	/* Then join them by ALT node. */
				3300	work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
				3301	if (BE (work_tree == NULL, 0))
				3302	goto parse_bracket_exp_espace;
				3303	}
				3304	else
				3305	{
				3306	re_free (sbcset);
				3307	work_tree = mbc_tree;
				3308	}
				3309	}
				3310	else
				3311	#endif /* not RE_ENABLE_I18N */
				3312	{
				3313	#ifdef RE_ENABLE_I18N
				3314	free_charset (mbcset);
				3315	#endif
				3316	/* Build a tree for simple bracket. */
				3317	br_token.type = SIMPLE_BRACKET;
				3318	br_token.opr.sbcset = sbcset;
				3319	work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
				3320	if (BE (work_tree == NULL, 0))
				3321	goto parse_bracket_exp_espace;
				3322	}
				3323	return work_tree;
				3324
				3325	parse_bracket_exp_espace:
				3326	*err = REG_ESPACE;
				3327	parse_bracket_exp_free_return:
				3328	re_free (sbcset);
				3329	#ifdef RE_ENABLE_I18N
				3330	free_charset (mbcset);
				3331	#endif /* RE_ENABLE_I18N */
				3332	return NULL;
				3333	}
				3334
				3335	/* Parse an element in the bracket expression. */
				3336
				3337	static reg_errcode_t
				3338	parse_bracket_element (bracket_elem_t elem, re_string_t regexp,
				3339	re_token_t token, int token_len, re_dfa_t dfa,
				3340	reg_syntax_t syntax, int accept_hyphen)
				3341	{
				3342	#ifdef RE_ENABLE_I18N
				3343	int cur_char_size;
				3344	cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
				3345	if (cur_char_size > 1)
				3346	{
				3347	elem->type = MB_CHAR;
				3348	elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
				3349	re_string_skip_bytes (regexp, cur_char_size);
				3350	return REG_NOERROR;
				3351	}
				3352	#endif /* RE_ENABLE_I18N */
				3353	re_string_skip_bytes (regexp, token_len); /* Skip a token. */
				3354	if (token->type == OP_OPEN_COLL_ELEM \|\| token->type == OP_OPEN_CHAR_CLASS
				3355	\|\| token->type == OP_OPEN_EQUIV_CLASS)
				3356	return parse_bracket_symbol (elem, regexp, token);
				3357	if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
				3358	{
				3359	/* A '-' must only appear as anything but a range indicator before
				3360	the closing bracket. Everything else is an error. */
				3361	re_token_t token2;
				3362	(void) peek_token_bracket (&token2, regexp, syntax);
				3363	if (token2.type != OP_CLOSE_BRACKET)
				3364	/* The actual error value is not standardized since this whole
				3365	case is undefined. But ERANGE makes good sense. */
				3366	return REG_ERANGE;
				3367	}
				3368	elem->type = SB_CHAR;
				3369	elem->opr.ch = token->opr.c;
				3370	return REG_NOERROR;
				3371	}
				3372
				3373	/* Parse a bracket symbol in the bracket expression. Bracket symbols are
				3374	such as [:<character_class>:], [.<collating_element>.], and
				3375	[=<equivalent_class>=]. */
				3376
				3377	static reg_errcode_t
				3378	parse_bracket_symbol (bracket_elem_t elem, re_string_t regexp,
				3379	re_token_t *token)
				3380	{
				3381	unsigned char ch, delim = token->opr.c;
				3382	int i = 0;
				3383	if (re_string_eoi(regexp))
				3384	return REG_EBRACK;
				3385	for (;; ++i)
				3386	{
				3387	if (i >= BRACKET_NAME_BUF_SIZE)
				3388	return REG_EBRACK;
				3389	if (token->type == OP_OPEN_CHAR_CLASS)
				3390	ch = re_string_fetch_byte_case (regexp);
				3391	else
				3392	ch = re_string_fetch_byte (regexp);
				3393	if (re_string_eoi(regexp))
				3394	return REG_EBRACK;
				3395	if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
				3396	break;
				3397	elem->opr.name[i] = ch;
				3398	}
				3399	re_string_skip_bytes (regexp, 1);
				3400	elem->opr.name[i] = '\0';
				3401	switch (token->type)
				3402	{
				3403	case OP_OPEN_COLL_ELEM:
				3404	elem->type = COLL_SYM;
				3405	break;
				3406	case OP_OPEN_EQUIV_CLASS:
				3407	elem->type = EQUIV_CLASS;
				3408	break;
				3409	case OP_OPEN_CHAR_CLASS:
				3410	elem->type = CHAR_CLASS;
				3411	break;
				3412	default:
				3413	break;
				3414	}
				3415	return REG_NOERROR;
				3416	}
				3417
				3418	/* Helper function for parse_bracket_exp.
				3419	Build the equivalence class which is represented by NAME.
				3420	The result are written to MBCSET and SBCSET.
				3421	EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
				3422	is a pointer argument sinse we may update it. */
				3423
				3424	static reg_errcode_t
				3425	#ifdef RE_ENABLE_I18N
				3426	build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
				3427	int equiv_class_alloc, const unsigned char name)
				3428	#else /* not RE_ENABLE_I18N */
				3429	build_equiv_class (bitset_t sbcset, const unsigned char *name)
				3430	#endif /* not RE_ENABLE_I18N */
				3431	{
				3432	#ifdef _LIBC
				3433	uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
				3434	if (nrules != 0)
				3435	{
				3436	const int32_t table, indirect;
				3437	const unsigned char weights, extra, *cp;
				3438	unsigned char char_buf[2];
				3439	int32_t idx1, idx2;
				3440	unsigned int ch;
				3441	size_t len;
				3442	/* This #include defines a local function! */
				3443	# include <locale/weight.h>
				3444	/* Calculate the index for equivalence class. */
				3445	cp = name;
				3446	table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
				3447	weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
				3448	_NL_COLLATE_WEIGHTMB);
				3449	extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
				3450	_NL_COLLATE_EXTRAMB);
				3451	indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
				3452	_NL_COLLATE_INDIRECTMB);
				3453	idx1 = findidx (&cp);
				3454	if (BE (idx1 == 0 \|\| cp < name + strlen ((const char *) name), 0))
				3455	/* This isn't a valid character. */
				3456	return REG_ECOLLATE;
				3457
				3458	/* Build single byte matcing table for this equivalence class. */
				3459	char_buf[1] = (unsigned char) '\0';
				3460	len = weights[idx1 & 0xffffff];
				3461	for (ch = 0; ch < SBC_MAX; ++ch)
				3462	{
				3463	char_buf[0] = ch;
				3464	cp = char_buf;
				3465	idx2 = findidx (&cp);
				3466	/*
				3467	idx2 = table[ch];
				3468	*/
				3469	if (idx2 == 0)
				3470	/* This isn't a valid character. */
				3471	continue;
				3472	/* Compare only if the length matches and the collation rule
				3473	index is the same. */
				3474	if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
				3475	{
				3476	int cnt = 0;
				3477
				3478	while (cnt <= len &&
				3479	weights[(idx1 & 0xffffff) + 1 + cnt]
				3480	== weights[(idx2 & 0xffffff) + 1 + cnt])
				3481	++cnt;
				3482
				3483	if (cnt > len)
				3484	bitset_set (sbcset, ch);
				3485	}
				3486	}
				3487	/* Check whether the array has enough space. */
				3488	if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
				3489	{
				3490	/* Not enough, realloc it. */
				3491	/* +1 in case of mbcset->nequiv_classes is 0. */
				3492	int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
				3493	/* Use realloc since the array is NULL if alloc == 0. /
				3494	int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
				3495	int32_t,
				3496	new_equiv_class_alloc);
				3497	if (BE (new_equiv_classes == NULL, 0))
				3498	return REG_ESPACE;
				3499	mbcset->equiv_classes = new_equiv_classes;
				3500	*equiv_class_alloc = new_equiv_class_alloc;
				3501	}
				3502	mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
				3503	}
				3504	else
				3505	#endif /* _LIBC */
				3506	{
				3507	if (BE (strlen ((const char *) name) != 1, 0))
				3508	return REG_ECOLLATE;
				3509	bitset_set (sbcset, *name);
				3510	}
				3511	return REG_NOERROR;
				3512	}
				3513
				3514	/* Helper function for parse_bracket_exp.
				3515	Build the character class which is represented by NAME.
				3516	The result are written to MBCSET and SBCSET.
				3517	CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
				3518	is a pointer argument sinse we may update it. */
				3519
				3520	static reg_errcode_t
				3521	#ifdef RE_ENABLE_I18N
				3522	build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
				3523	re_charset_t mbcset, int char_class_alloc,
				3524	const char *class_name, reg_syntax_t syntax)
				3525	#else /* not RE_ENABLE_I18N */
				3526	build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
				3527	const char *class_name, reg_syntax_t syntax)
				3528	#endif /* not RE_ENABLE_I18N */
				3529	{
				3530	int i;
				3531
				3532	/* In case of REG_ICASE "upper" and "lower" match the both of
				3533	upper and lower cases. */
				3534	if ((syntax & RE_ICASE)
				3535	&& (strcmp (class_name, "upper") == 0 \|\| strcmp (class_name, "lower") == 0))
				3536	class_name = "alpha";
				3537
				3538	#ifdef RE_ENABLE_I18N
				3539	/* Check the space of the arrays. */
				3540	if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
				3541	{
				3542	/* Not enough, realloc it. */
				3543	/* +1 in case of mbcset->nchar_classes is 0. */
				3544	int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
				3545	/* Use realloc since array is NULL if alloc == 0. /
				3546	wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
				3547	new_char_class_alloc);
				3548	if (BE (new_char_classes == NULL, 0))
				3549	return REG_ESPACE;
				3550	mbcset->char_classes = new_char_classes;
				3551	*char_class_alloc = new_char_class_alloc;
				3552	}
				3553	mbcset->char_classes[mbcset->nchar_classes++] = __wctype (class_name);
				3554	#endif /* RE_ENABLE_I18N */
				3555
				3556	#define BUILD_CHARCLASS_LOOP(ctype_func) \
				3557	do { \
				3558	if (BE (trans != NULL, 0)) \
				3559	{ \
				3560	for (i = 0; i < SBC_MAX; ++i) \
				3561	if (ctype_func (i)) \
				3562	bitset_set (sbcset, trans[i]); \
				3563	} \
				3564	else \
				3565	{ \
				3566	for (i = 0; i < SBC_MAX; ++i) \
				3567	if (ctype_func (i)) \
				3568	bitset_set (sbcset, i); \
				3569	} \
				3570	} while (0)
				3571
				3572	if (strcmp (class_name, "alnum") == 0)
				3573	BUILD_CHARCLASS_LOOP (isalnum);
				3574	else if (strcmp (class_name, "cntrl") == 0)
				3575	BUILD_CHARCLASS_LOOP (iscntrl);
				3576	else if (strcmp (class_name, "lower") == 0)
				3577	BUILD_CHARCLASS_LOOP (islower);
				3578	else if (strcmp (class_name, "space") == 0)
				3579	BUILD_CHARCLASS_LOOP (isspace);
				3580	else if (strcmp (class_name, "alpha") == 0)
				3581	BUILD_CHARCLASS_LOOP (isalpha);
				3582	else if (strcmp (class_name, "digit") == 0)
				3583	BUILD_CHARCLASS_LOOP (isdigit);
				3584	else if (strcmp (class_name, "print") == 0)
				3585	BUILD_CHARCLASS_LOOP (isprint);
				3586	else if (strcmp (class_name, "upper") == 0)
				3587	BUILD_CHARCLASS_LOOP (isupper);
				3588	else if (strcmp (class_name, "blank") == 0)
				3589	#ifndef GAWK
				3590	BUILD_CHARCLASS_LOOP (isblank);
				3591	#else
				3592	/* see comments above */
				3593	BUILD_CHARCLASS_LOOP (is_blank);
				3594	#endif
				3595	else if (strcmp (class_name, "graph") == 0)
				3596	BUILD_CHARCLASS_LOOP (isgraph);
				3597	else if (strcmp (class_name, "punct") == 0)
				3598	BUILD_CHARCLASS_LOOP (ispunct);
				3599	else if (strcmp (class_name, "xdigit") == 0)
				3600	BUILD_CHARCLASS_LOOP (isxdigit);
				3601	else
				3602	return REG_ECTYPE;
				3603
				3604	return REG_NOERROR;
				3605	}
				3606
				3607	static bin_tree_t *
				3608	build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
				3609	const char *class_name,
				3610	const char *extra, int non_match,
				3611	reg_errcode_t *err)
				3612	{
				3613	re_bitset_ptr_t sbcset;
				3614	#ifdef RE_ENABLE_I18N
				3615	re_charset_t *mbcset;
				3616	int alloc = 0;
				3617	#endif /* not RE_ENABLE_I18N */
				3618	reg_errcode_t ret;
				3619	re_token_t br_token;
				3620	bin_tree_t *tree;
				3621
				3622	sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
				3623	#ifdef RE_ENABLE_I18N
				3624	mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
				3625	#endif /* RE_ENABLE_I18N */
				3626
				3627	#ifdef RE_ENABLE_I18N
				3628	if (BE (sbcset == NULL \|\| mbcset == NULL, 0))
				3629	#else /* not RE_ENABLE_I18N */
				3630	if (BE (sbcset == NULL, 0))
				3631	#endif /* not RE_ENABLE_I18N */
				3632	{
				3633	*err = REG_ESPACE;
				3634	return NULL;
				3635	}
				3636
				3637	if (non_match)
				3638	{
				3639	#ifdef RE_ENABLE_I18N
				3640	mbcset->non_match = 1;
				3641	#endif /* not RE_ENABLE_I18N */
				3642	}
				3643
				3644	/* We don't care the syntax in this case. */
				3645	ret = build_charclass (trans, sbcset,
				3646	#ifdef RE_ENABLE_I18N
				3647	mbcset, &alloc,
				3648	#endif /* RE_ENABLE_I18N */
				3649	class_name, 0);
				3650
				3651	if (BE (ret != REG_NOERROR, 0))
				3652	{
				3653	re_free (sbcset);
				3654	#ifdef RE_ENABLE_I18N
				3655	free_charset (mbcset);
				3656	#endif /* RE_ENABLE_I18N */
				3657	*err = ret;
				3658	return NULL;
				3659	}
				3660	/* \w match '_' also. */
				3661	for (; *extra; extra++)
				3662	bitset_set (sbcset, *extra);
				3663
				3664	/* If it is non-matching list. */
				3665	if (non_match)
				3666	bitset_not (sbcset);
				3667
				3668	#ifdef RE_ENABLE_I18N
				3669	/* Ensure only single byte characters are set. */
				3670	if (dfa->mb_cur_max > 1)
				3671	bitset_mask (sbcset, dfa->sb_char);
				3672	#endif
				3673
				3674	/* Build a tree for simple bracket. */
				3675	br_token.type = SIMPLE_BRACKET;
				3676	br_token.opr.sbcset = sbcset;
				3677	tree = create_token_tree (dfa, NULL, NULL, &br_token);
				3678	if (BE (tree == NULL, 0))
				3679	goto build_word_op_espace;
				3680
				3681	#ifdef RE_ENABLE_I18N
				3682	if (dfa->mb_cur_max > 1)
				3683	{
				3684	bin_tree_t *mbc_tree;
				3685	/* Build a tree for complex bracket. */
				3686	br_token.type = COMPLEX_BRACKET;
				3687	br_token.opr.mbcset = mbcset;
				3688	dfa->has_mb_node = 1;
				3689	mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
				3690	if (BE (mbc_tree == NULL, 0))
				3691	goto build_word_op_espace;
				3692	/* Then join them by ALT node. */
				3693	tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
				3694	if (BE (mbc_tree != NULL, 1))
				3695	return tree;
				3696	}
				3697	else
				3698	{
				3699	free_charset (mbcset);
				3700	return tree;
				3701	}
				3702	#else /* not RE_ENABLE_I18N */
				3703	return tree;
				3704	#endif /* not RE_ENABLE_I18N */
				3705
				3706	build_word_op_espace:
				3707	re_free (sbcset);
				3708	#ifdef RE_ENABLE_I18N
				3709	free_charset (mbcset);
				3710	#endif /* RE_ENABLE_I18N */
				3711	*err = REG_ESPACE;
				3712	return NULL;
				3713	}
				3714
				3715	/* This is intended for the expressions like "a{1,3}".
				3716	Fetch a number from `input', and return the number.
				3717	Return -1, if the number field is empty like "{,1}".
				3718	Return -2, If an error is occured. */
				3719
				3720	static int
				3721	fetch_number (re_string_t input, re_token_t token, reg_syntax_t syntax)
				3722	{
				3723	int num = -1;
				3724	unsigned char c;
				3725	while (1)
				3726	{
				3727	fetch_token (token, input, syntax);
				3728	c = token->opr.c;
				3729	if (BE (token->type == END_OF_RE, 0))
				3730	return -2;
				3731	if (token->type == OP_CLOSE_DUP_NUM \|\| c == ',')
				3732	break;
				3733	num = ((token->type != CHARACTER \|\| c < '0' \|\| '9' < c \|\| num == -2)
				3734	? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
				3735	num = (num > RE_DUP_MAX) ? -2 : num;
				3736	}
				3737	return num;
				3738	}
				3739
				3740	#ifdef RE_ENABLE_I18N
				3741	static void
				3742	free_charset (re_charset_t *cset)
				3743	{
				3744	re_free (cset->mbchars);
				3745	# ifdef _LIBC
				3746	re_free (cset->coll_syms);
				3747	re_free (cset->equiv_classes);
				3748	re_free (cset->range_starts);
				3749	re_free (cset->range_ends);
				3750	# endif
				3751	re_free (cset->char_classes);
				3752	re_free (cset);
				3753	}
				3754	#endif /* RE_ENABLE_I18N */
				3755
				3756	/* Functions for binary tree operation. */
				3757
				3758	/* Create a tree node. */
				3759
				3760	static bin_tree_t *
				3761	create_tree (re_dfa_t dfa, bin_tree_t left, bin_tree_t *right,
				3762	re_token_type_t type)
				3763	{
				3764	re_token_t t;
				3765	t.type = type;
				3766	return create_token_tree (dfa, left, right, &t);
				3767	}
				3768
				3769	static bin_tree_t *
				3770	create_token_tree (re_dfa_t dfa, bin_tree_t left, bin_tree_t *right,
				3771	const re_token_t *token)
				3772	{
				3773	bin_tree_t *tree;
				3774	if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
				3775	{
				3776	bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
				3777
				3778	if (storage == NULL)
				3779	return NULL;
				3780	storage->next = dfa->str_tree_storage;
				3781	dfa->str_tree_storage = storage;
				3782	dfa->str_tree_storage_idx = 0;
				3783	}
				3784	tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
				3785
				3786	tree->parent = NULL;
				3787	tree->left = left;
				3788	tree->right = right;
				3789	tree->token = *token;
				3790	tree->token.duplicated = 0;
				3791	tree->token.opt_subexp = 0;
				3792	tree->first = NULL;
				3793	tree->next = NULL;
				3794	tree->node_idx = -1;
				3795
				3796	if (left != NULL)
				3797	left->parent = tree;
				3798	if (right != NULL)
				3799	right->parent = tree;
				3800	return tree;
				3801	}
				3802
				3803	/* Mark the tree SRC as an optional subexpression.
				3804	To be called from preorder or postorder. */
				3805
				3806	static reg_errcode_t
				3807	mark_opt_subexp (void extra, bin_tree_t node)
				3808	{
				3809	int idx = (int) (long) extra;
				3810	if (node->token.type == SUBEXP && node->token.opr.idx == idx)
				3811	node->token.opt_subexp = 1;
				3812
				3813	return REG_NOERROR;
				3814	}
				3815
				3816	/* Free the allocated memory inside NODE. */
				3817
				3818	static void
				3819	free_token (re_token_t *node)
				3820	{
				3821	#ifdef RE_ENABLE_I18N
				3822	if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
				3823	free_charset (node->opr.mbcset);
				3824	else
				3825	#endif /* RE_ENABLE_I18N */
				3826	if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
				3827	re_free (node->opr.sbcset);
				3828	}
				3829
				3830	/* Worker function for tree walking. Free the allocated memory inside NODE
				3831	and its children. */
				3832
				3833	static reg_errcode_t
				3834	free_tree (void extra, bin_tree_t node)
				3835	{
				3836	free_token (&node->token);
				3837	return REG_NOERROR;
				3838	}
				3839
				3840
				3841	/* Duplicate the node SRC, and return new node. This is a preorder
				3842	visit similar to the one implemented by the generic visitor, but
				3843	we need more infrastructure to maintain two parallel trees --- so,
				3844	it's easier to duplicate. */
				3845
				3846	static bin_tree_t *
				3847	duplicate_tree (const bin_tree_t root, re_dfa_t dfa)
				3848	{
				3849	const bin_tree_t *node;
				3850	bin_tree_t *dup_root;
				3851	bin_tree_t *p_new = &dup_root, dup_node = root->parent;
				3852
				3853	for (node = root; ; )
				3854	{
				3855	/* Create a new tree and link it back to the current parent. */
				3856	*p_new = create_token_tree (dfa, NULL, NULL, &node->token);
				3857	if (*p_new == NULL)
				3858	return NULL;
				3859	(*p_new)->parent = dup_node;
				3860	(*p_new)->token.duplicated = 1;
				3861	dup_node = *p_new;
				3862
				3863	/* Go to the left node, or up and to the right. */
				3864	if (node->left)
				3865	{
				3866	node = node->left;
				3867	p_new = &dup_node->left;
				3868	}
				3869	else
				3870	{
				3871	const bin_tree_t *prev = NULL;
				3872	while (node->right == prev \|\| node->right == NULL)
				3873	{
				3874	prev = node;
				3875	node = node->parent;
				3876	dup_node = dup_node->parent;
				3877	if (!node)
				3878	return dup_root;
				3879	}
				3880	node = node->right;
				3881	p_new = &dup_node->right;
				3882	}
				3883	}
				3884	}