Blame - compat/regex/regcomp.c - jrn/git

blob: 51cd60baa37adbeef788098482d804c58cb9979f [file] [log] [blame]

Ævar Arnfjörð Bjarmason	d18f76d	2010-08-17 09:24:38 +0000	[diff] [blame]	1	/* Extended regular expression matching and search library.
				2	Copyright (C) 2002-2007,2009,2010 Free Software Foundation, Inc.
				3	This file is part of the GNU C Library.
				4	Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
				5
				6	The GNU C Library is free software; you can redistribute it and/or
				7	modify it under the terms of the GNU Lesser General Public
				8	License as published by the Free Software Foundation; either
				9	version 2.1 of the License, or (at your option) any later version.
				10
				11	The GNU C Library is distributed in the hope that it will be useful,
				12	but WITHOUT ANY WARRANTY; without even the implied warranty of
				13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				14	Lesser General Public License for more details.
				15
				16	You should have received a copy of the GNU Lesser General Public
Todd Zullinger	4842579	2017-11-07 00:39:33 -0500	[diff] [blame]	17	License along with the GNU C Library; if not, see
				18	<http://www.gnu.org/licenses/>. */
Ævar Arnfjörð Bjarmason	d18f76d	2010-08-17 09:24:38 +0000	[diff] [blame]	19
				20	static reg_errcode_t re_compile_internal (regex_t preg, const char pattern,
				21	size_t length, reg_syntax_t syntax);
				22	static void re_compile_fastmap_iter (regex_t *bufp,
				23	const re_dfastate_t *init_state,
				24	char *fastmap);
				25	static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
				26	#ifdef RE_ENABLE_I18N
				27	static void free_charset (re_charset_t *cset);
				28	#endif /* RE_ENABLE_I18N */
				29	static void free_workarea_compile (regex_t *preg);
				30	static reg_errcode_t create_initial_state (re_dfa_t *dfa);
				31	#ifdef RE_ENABLE_I18N
				32	static void optimize_utf8 (re_dfa_t *dfa);
				33	#endif
				34	static reg_errcode_t analyze (regex_t *preg);
				35	static reg_errcode_t preorder (bin_tree_t *root,
				36	reg_errcode_t (fn (void , bin_tree_t )),
				37	void *extra);
				38	static reg_errcode_t postorder (bin_tree_t *root,
				39	reg_errcode_t (fn (void , bin_tree_t )),
				40	void *extra);
				41	static reg_errcode_t optimize_subexps (void extra, bin_tree_t node);
				42	static reg_errcode_t lower_subexps (void extra, bin_tree_t node);
				43	static bin_tree_t lower_subexp (reg_errcode_t err, regex_t *preg,
				44	bin_tree_t *node);
				45	static reg_errcode_t calc_first (void extra, bin_tree_t node);
				46	static reg_errcode_t calc_next (void extra, bin_tree_t node);
				47	static reg_errcode_t link_nfa_nodes (void extra, bin_tree_t node);
				48	static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
				49	static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
				50	unsigned int constraint);
				51	static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
				52	static reg_errcode_t calc_eclosure_iter (re_node_set new_set, re_dfa_t dfa,
				53	int node, int root);
				54	static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
				55	static int fetch_number (re_string_t input, re_token_t token,
				56	reg_syntax_t syntax);
				57	static int peek_token (re_token_t token, re_string_t input,
				58	reg_syntax_t syntax) internal_function;
				59	static bin_tree_t parse (re_string_t regexp, regex_t *preg,
				60	reg_syntax_t syntax, reg_errcode_t *err);
				61	static bin_tree_t parse_reg_exp (re_string_t regexp, regex_t *preg,
				62	re_token_t *token, reg_syntax_t syntax,
				63	int nest, reg_errcode_t *err);
				64	static bin_tree_t parse_branch (re_string_t regexp, regex_t *preg,
				65	re_token_t *token, reg_syntax_t syntax,
				66	int nest, reg_errcode_t *err);
				67	static bin_tree_t parse_expression (re_string_t regexp, regex_t *preg,
				68	re_token_t *token, reg_syntax_t syntax,
				69	int nest, reg_errcode_t *err);
				70	static bin_tree_t parse_sub_exp (re_string_t regexp, regex_t *preg,
				71	re_token_t *token, reg_syntax_t syntax,
				72	int nest, reg_errcode_t *err);
				73	static bin_tree_t parse_dup_op (bin_tree_t dup_elem, re_string_t *regexp,
				74	re_dfa_t dfa, re_token_t token,
				75	reg_syntax_t syntax, reg_errcode_t *err);
				76	static bin_tree_t parse_bracket_exp (re_string_t regexp, re_dfa_t *dfa,
				77	re_token_t *token, reg_syntax_t syntax,
				78	reg_errcode_t *err);
				79	static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
				80	re_string_t *regexp,
				81	re_token_t *token, int token_len,
				82	re_dfa_t *dfa,
				83	reg_syntax_t syntax,
				84	int accept_hyphen);
				85	static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
				86	re_string_t *regexp,
				87	re_token_t *token);
				88	#ifdef RE_ENABLE_I18N
				89	static reg_errcode_t build_equiv_class (bitset_t sbcset,
				90	re_charset_t *mbcset,
				91	int *equiv_class_alloc,
				92	const unsigned char *name);
				93	static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
				94	bitset_t sbcset,
				95	re_charset_t *mbcset,
				96	int *char_class_alloc,
				97	const char *class_name,
				98	reg_syntax_t syntax);
				99	#else /* not RE_ENABLE_I18N */
				100	static reg_errcode_t build_equiv_class (bitset_t sbcset,
				101	const unsigned char *name);
				102	static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
				103	bitset_t sbcset,
				104	const char *class_name,
				105	reg_syntax_t syntax);
				106	#endif /* not RE_ENABLE_I18N */
				107	static bin_tree_t build_charclass_op (re_dfa_t dfa,
				108	RE_TRANSLATE_TYPE trans,
				109	const char *class_name,
				110	const char *extra,
				111	int non_match, reg_errcode_t *err);
				112	static bin_tree_t create_tree (re_dfa_t dfa,
				113	bin_tree_t left, bin_tree_t right,
				114	re_token_type_t type);
				115	static bin_tree_t create_token_tree (re_dfa_t dfa,
				116	bin_tree_t left, bin_tree_t right,
				117	const re_token_t *token);
				118	static bin_tree_t duplicate_tree (const bin_tree_t src, re_dfa_t *dfa);
				119	static void free_token (re_token_t *node);
				120	static reg_errcode_t free_tree (void extra, bin_tree_t node);
				121	static reg_errcode_t mark_opt_subexp (void extra, bin_tree_t node);
				122
				123	/* This table gives an error message for each of the error codes listed
				124	in regex.h. Obviously the order here has to be same as there.
				125	POSIX doesn't require that we do anything for REG_NOERROR,
				126	but why not be nice? */
				127
				128	const char __re_error_msgid[] attribute_hidden =
				129	{
				130	#define REG_NOERROR_IDX 0
				131	gettext_noop ("Success") /* REG_NOERROR */
				132	"\0"
				133	#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
				134	gettext_noop ("No match") /* REG_NOMATCH */
				135	"\0"
				136	#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
				137	gettext_noop ("Invalid regular expression") /* REG_BADPAT */
				138	"\0"
				139	#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
				140	gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
				141	"\0"
				142	#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
				143	gettext_noop ("Invalid character class name") /* REG_ECTYPE */
				144	"\0"
				145	#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
				146	gettext_noop ("Trailing backslash") /* REG_EESCAPE */
				147	"\0"
				148	#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
				149	gettext_noop ("Invalid back reference") /* REG_ESUBREG */
				150	"\0"
				151	#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
				152	gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
				153	"\0"
				154	#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
				155	gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
				156	"\0"
				157	#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
				158	gettext_noop ("Unmatched \\{") /* REG_EBRACE */
				159	"\0"
				160	#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
				161	gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
				162	"\0"
				163	#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
				164	gettext_noop ("Invalid range end") /* REG_ERANGE */
				165	"\0"
				166	#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
				167	gettext_noop ("Memory exhausted") /* REG_ESPACE */
				168	"\0"
				169	#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
				170	gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
				171	"\0"
				172	#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
				173	gettext_noop ("Premature end of regular expression") /* REG_EEND */
				174	"\0"
				175	#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
				176	gettext_noop ("Regular expression too big") /* REG_ESIZE */
				177	"\0"
				178	#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
				179	gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
				180	};
				181
				182	const size_t __re_error_msgid_idx[] attribute_hidden =
				183	{
				184	REG_NOERROR_IDX,
				185	REG_NOMATCH_IDX,
				186	REG_BADPAT_IDX,
				187	REG_ECOLLATE_IDX,
				188	REG_ECTYPE_IDX,
				189	REG_EESCAPE_IDX,
				190	REG_ESUBREG_IDX,
				191	REG_EBRACK_IDX,
				192	REG_EPAREN_IDX,
				193	REG_EBRACE_IDX,
				194	REG_BADBR_IDX,
				195	REG_ERANGE_IDX,
				196	REG_ESPACE_IDX,
				197	REG_BADRPT_IDX,
				198	REG_EEND_IDX,
				199	REG_ESIZE_IDX,
				200	REG_ERPAREN_IDX
				201	};
				202
				203	/* Entry points for GNU code. */
				204
				205
				206	#ifdef ZOS_USS
				207
				208	/* For ZOS USS we must define btowc */
				209
				210	wchar_t
				211	btowc (int c)
				212	{
				213	wchar_t wtmp[2];
				214	char tmp[2];
				215
				216	tmp[0] = c;
				217	tmp[1] = 0;
				218
				219	mbtowc (wtmp, tmp, 1);
				220	return wtmp[0];
				221	}
				222	#endif
				223
				224	/* re_compile_pattern is the GNU regular expression compiler: it
				225	compiles PATTERN (of length LENGTH) and puts the result in BUFP.
				226	Returns 0 if the pattern was valid, otherwise an error string.
				227
				228	Assumes the `allocated' (and perhaps `buffer') and `translate' fields
				229	are set in BUFP on entry. */
				230
				231	const char *
Junio C Hamano	178b331	2010-08-19 15:31:30 -0700	[diff] [blame]	232	re_compile_pattern (const char *pattern,
				233	size_t length,
				234	struct re_pattern_buffer *bufp)
Ævar Arnfjörð Bjarmason	d18f76d	2010-08-17 09:24:38 +0000	[diff] [blame]	235	{
				236	reg_errcode_t ret;
				237
				238	/* And GNU code determines whether or not to get register information
				239	by passing null for the REGS argument to re_match, etc., not by
				240	setting no_sub, unless RE_NO_SUB is set. */
				241	bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
				242
				243	/* Match anchors at newline. */
				244	bufp->newline_anchor = 1;
				245
				246	ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
				247
				248	if (!ret)
				249	return NULL;
				250	return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
				251	}
				252	#ifdef _LIBC
				253	weak_alias (__re_compile_pattern, re_compile_pattern)
				254	#endif
				255
				256	/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
				257	also be assigned to arbitrarily: each pattern buffer stores its own
				258	syntax, so it can be changed between regex compilations. */
				259	/* This has no initializer because initialized variables in Emacs
				260	become read-only after dumping. */
				261	reg_syntax_t re_syntax_options;
				262
				263
				264	/* Specify the precise syntax of regexps for compilation. This provides
				265	for compatibility for various utilities which historically have
				266	different, incompatible syntaxes.
				267
				268	The argument SYNTAX is a bit mask comprised of the various bits
				269	defined in regex.h. We return the old syntax. */
				270
				271	reg_syntax_t
Junio C Hamano	178b331	2010-08-19 15:31:30 -0700	[diff] [blame]	272	re_set_syntax (reg_syntax_t syntax)
Ævar Arnfjörð Bjarmason	d18f76d	2010-08-17 09:24:38 +0000	[diff] [blame]	273	{
				274	reg_syntax_t ret = re_syntax_options;
				275
				276	re_syntax_options = syntax;
				277	return ret;
				278	}
				279	#ifdef _LIBC
				280	weak_alias (__re_set_syntax, re_set_syntax)
				281	#endif
				282
				283	int
Junio C Hamano	178b331	2010-08-19 15:31:30 -0700	[diff] [blame]	284	re_compile_fastmap (struct re_pattern_buffer *bufp)
Ævar Arnfjörð Bjarmason	d18f76d	2010-08-17 09:24:38 +0000	[diff] [blame]	285	{
				286	re_dfa_t dfa = (re_dfa_t ) bufp->buffer;
				287	char *fastmap = bufp->fastmap;
				288
				289	memset (fastmap, '\0', sizeof (char) * SBC_MAX);
				290	re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
				291	if (dfa->init_state != dfa->init_state_word)
				292	re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
				293	if (dfa->init_state != dfa->init_state_nl)
				294	re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
				295	if (dfa->init_state != dfa->init_state_begbuf)
				296	re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
				297	bufp->fastmap_accurate = 1;
				298	return 0;
				299	}
				300	#ifdef _LIBC
				301	weak_alias (__re_compile_fastmap, re_compile_fastmap)
				302	#endif
				303
				304	static inline void
				305	__attribute ((always_inline))
				306	re_set_fastmap (char *fastmap, int icase, int ch)
				307	{
				308	fastmap[ch] = 1;
				309	if (icase)
				310	fastmap[tolower (ch)] = 1;
				311	}
				312
				313	/* Helper function for re_compile_fastmap.
				314	Compile fastmap for the initial_state INIT_STATE. */
				315
				316	static void
				317	re_compile_fastmap_iter (regex_t bufp, const re_dfastate_t init_state,
				318	char *fastmap)
				319	{
				320	volatile re_dfa_t dfa = (re_dfa_t ) bufp->buffer;
				321	int node_cnt;
				322	int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
				323	for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
				324	{
				325	int node = init_state->nodes.elems[node_cnt];
				326	re_token_type_t type = dfa->nodes[node].type;
				327
				328	if (type == CHARACTER)
				329	{
				330	re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
				331	#ifdef RE_ENABLE_I18N
				332	if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
				333	{
				334	unsigned char buf = re_malloc (unsigned char, dfa->mb_cur_max), p;
				335	wchar_t wc;
				336	mbstate_t state;
				337
				338	p = buf;
				339	*p++ = dfa->nodes[node].opr.c;
				340	while (++node < dfa->nodes_len
Junio C Hamano	c01499e	2013-10-16 10:26:39 -0700	[diff] [blame]	341	&& dfa->nodes[node].type == CHARACTER
Ævar Arnfjörð Bjarmason	d18f76d	2010-08-17 09:24:38 +0000	[diff] [blame]	342	&& dfa->nodes[node].mb_partial)
				343	*p++ = dfa->nodes[node].opr.c;
				344	memset (&state, '\0', sizeof (state));
				345	if (__mbrtowc (&wc, (const char *) buf, p - buf,
				346	&state) == p - buf
				347	&& (__wcrtomb ((char *) buf, towlower (wc), &state)
				348	!= (size_t) -1))
				349	re_set_fastmap (fastmap, 0, buf[0]);
				350	re_free (buf);
				351	}
				352	#endif
				353	}
				354	else if (type == SIMPLE_BRACKET)
				355	{
				356	int i, ch;
				357	for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
				358	{
				359	int j;
				360	bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
				361	for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
				362	if (w & ((bitset_word_t) 1 << j))
				363	re_set_fastmap (fastmap, icase, ch);
				364	}
				365	}
				366	#ifdef RE_ENABLE_I18N
				367	else if (type == COMPLEX_BRACKET)
				368	{
				369	re_charset_t *cset = dfa->nodes[node].opr.mbcset;
				370	int i;
				371
				372	# ifdef _LIBC
				373	/* See if we have to try all bytes which start multiple collation
				374	elements.
				375	e.g. In da_DK, we want to catch 'a' since "aa" is a valid
				376	collation element, and don't catch 'b' since 'b' is
				377	the only collation element which starts from 'b' (and
				378	it is caught by SIMPLE_BRACKET). */
				379	if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
				380	&& (cset->ncoll_syms \|\| cset->nranges))
				381	{
				382	const int32_t table = (const int32_t )
				383	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
				384	for (i = 0; i < SBC_MAX; ++i)
				385	if (table[i] < 0)
				386	re_set_fastmap (fastmap, icase, i);
				387	}
				388	# endif /* _LIBC */
				389
				390	/* See if we have to start the match at all multibyte characters,
				391	i.e. where we would not find an invalid sequence. This only
				392	applies to multibyte character sets; for single byte character
				393	sets, the SIMPLE_BRACKET again suffices. */
				394	if (dfa->mb_cur_max > 1
				395	&& (cset->nchar_classes \|\| cset->non_match \|\| cset->nranges
				396	# ifdef _LIBC
				397	\|\| cset->nequiv_classes
				398	# endif /* _LIBC */
				399	))
				400	{
				401	unsigned char c = 0;
				402	do
				403	{
				404	mbstate_t mbs;
				405	memset (&mbs, 0, sizeof (mbs));
				406	if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
				407	re_set_fastmap (fastmap, false, (int) c);
				408	}
				409	while (++c != 0);
				410	}
				411
				412	else
				413	{
				414	/* ... Else catch all bytes which can start the mbchars. */
				415	for (i = 0; i < cset->nmbchars; ++i)
				416	{
				417	char buf[256];
				418	mbstate_t state;
				419	memset (&state, '\0', sizeof (state));
				420	if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
				421	re_set_fastmap (fastmap, icase, (unsigned char ) buf);
				422	if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
				423	{
				424	if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
				425	!= (size_t) -1)
				426	re_set_fastmap (fastmap, false, (unsigned char ) buf);
				427	}
				428	}
				429	}
				430	}
				431	#endif /* RE_ENABLE_I18N */
				432	else if (type == OP_PERIOD
				433	#ifdef RE_ENABLE_I18N
				434	\|\| type == OP_UTF8_PERIOD
				435	#endif /* RE_ENABLE_I18N */
				436	\|\| type == END_OF_RE)
				437	{
				438	memset (fastmap, '\1', sizeof (char) * SBC_MAX);
				439	if (type == END_OF_RE)
				440	bufp->can_be_null = 1;
				441	return;
				442	}
				443	}
				444	}
				445
				446	/* Entry point for POSIX code. */
				447	/* regcomp takes a regular expression as a string and compiles it.
				448
				449	PREG is a regex_t *. We do not expect any fields to be initialized,
				450	since POSIX says we shouldn't. Thus, we set
				451
				452	`buffer' to the compiled pattern;
				453	`used' to the length of the compiled pattern;
				454	`syntax' to RE_SYNTAX_POSIX_EXTENDED if the
				455	REG_EXTENDED bit in CFLAGS is set; otherwise, to
				456	RE_SYNTAX_POSIX_BASIC;
				457	`newline_anchor' to REG_NEWLINE being set in CFLAGS;
				458	`fastmap' to an allocated space for the fastmap;
				459	`fastmap_accurate' to zero;
				460	`re_nsub' to the number of subexpressions in PATTERN.
				461
				462	PATTERN is the address of the pattern string.
				463
				464	CFLAGS is a series of bits which affect compilation.
				465
				466	If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
				467	use POSIX basic syntax.
				468
				469	If REG_NEWLINE is set, then . and [^...] don't match newline.
				470	Also, regexec will try a match beginning after every newline.
				471
				472	If REG_ICASE is set, then we considers upper- and lowercase
				473	versions of letters to be equivalent when matching.
				474
				475	If REG_NOSUB is set, then when PREG is passed to regexec, that
				476	routine will report only success or failure, and nothing about the
				477	registers.
				478
				479	It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
				480	the return codes and their meanings.) */
				481
				482	int
Junio C Hamano	178b331	2010-08-19 15:31:30 -0700	[diff] [blame]	483	regcomp (regex_t *__restrict preg,
				484	const char *__restrict pattern,
				485	int cflags)
Ævar Arnfjörð Bjarmason	d18f76d	2010-08-17 09:24:38 +0000	[diff] [blame]	486	{
				487	reg_errcode_t ret;
				488	reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
				489	: RE_SYNTAX_POSIX_BASIC);
				490
				491	preg->buffer = NULL;
				492	preg->allocated = 0;
				493	preg->used = 0;
				494
				495	/* Try to allocate space for the fastmap. */
				496	preg->fastmap = re_malloc (char, SBC_MAX);
				497	if (BE (preg->fastmap == NULL, 0))
				498	return REG_ESPACE;
				499
				500	syntax \|= (cflags & REG_ICASE) ? RE_ICASE : 0;
				501
				502	/* If REG_NEWLINE is set, newlines are treated differently. */
				503	if (cflags & REG_NEWLINE)
				504	{ /* REG_NEWLINE implies neither . nor [^...] match newline. */
				505	syntax &= ~RE_DOT_NEWLINE;
				506	syntax \|= RE_HAT_LISTS_NOT_NEWLINE;
				507	/* It also changes the matching behavior. */
				508	preg->newline_anchor = 1;
				509	}
				510	else
				511	preg->newline_anchor = 0;
				512	preg->no_sub = !!(cflags & REG_NOSUB);
				513	preg->translate = NULL;
				514
				515	ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
				516
				517	/* POSIX doesn't distinguish between an unmatched open-group and an
				518	unmatched close-group: both are REG_EPAREN. */
				519	if (ret == REG_ERPAREN)
				520	ret = REG_EPAREN;
				521
				522	/* We have already checked preg->fastmap != NULL. */
				523	if (BE (ret == REG_NOERROR, 1))
				524	/* Compute the fastmap now, since regexec cannot modify the pattern
				525	buffer. This function never fails in this implementation. */
				526	(void) re_compile_fastmap (preg);
				527	else
				528	{
				529	/* Some error occurred while compiling the expression. */
				530	re_free (preg->fastmap);
				531	preg->fastmap = NULL;
				532	}
				533
				534	return (int) ret;
				535	}
				536	#ifdef _LIBC
				537	weak_alias (__regcomp, regcomp)
				538	#endif
				539
				540	/* Returns a message corresponding to an error code, ERRCODE, returned
				541	from either regcomp or regexec. We don't use PREG here. */
				542
				543	size_t
Frank Li	de83172	2010-08-17 09:24:40 +0000	[diff] [blame]	544	regerror(int errcode, const regex_t *__restrict preg,
				545	char *__restrict errbuf, size_t errbuf_size)
Ævar Arnfjörð Bjarmason	d18f76d	2010-08-17 09:24:38 +0000	[diff] [blame]	546	{
				547	const char *msg;
				548	size_t msg_size;
				549
				550	if (BE (errcode < 0
				551	\|\| errcode >= (int) (sizeof (__re_error_msgid_idx)
				552	/ sizeof (__re_error_msgid_idx[0])), 0))
				553	/* Only error codes returned by the rest of the code should be passed
				554	to this routine. If we are given anything else, or if other regex
				555	code generates an invalid error code, then the program has a bug.
				556	Dump core so we can fix it. */
				557	abort ();
				558
				559	msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
				560
				561	msg_size = strlen (msg) + 1; /* Includes the null. */
				562
				563	if (BE (errbuf_size != 0, 1))
				564	{
				565	if (BE (msg_size > errbuf_size, 0))
				566	{
				567	memcpy (errbuf, msg, errbuf_size - 1);
				568	errbuf[errbuf_size - 1] = 0;
				569	}
				570	else
				571	memcpy (errbuf, msg, msg_size);
				572	}
				573
				574	return msg_size;
				575	}
				576	#ifdef _LIBC
				577	weak_alias (__regerror, regerror)
				578	#endif
				579
				580
				581	#ifdef RE_ENABLE_I18N
				582	/* This static array is used for the map to single-byte characters when
				583	UTF-8 is used. Otherwise we would allocate memory just to initialize
				584	it the same all the time. UTF-8 is the preferred encoding so this is
				585	a worthwhile optimization. */
				586	#if __GNUC__ >= 3
				587	static const bitset_t utf8_sb_map = {
				588	/* Set the first 128 bits. */
				589	[0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
				590	};
				591	#else /* ! (__GNUC__ >= 3) */
				592	static bitset_t utf8_sb_map;
				593	#endif /* __GNUC__ >= 3 */
				594	#endif /* RE_ENABLE_I18N */
				595
				596
				597	static void
				598	free_dfa_content (re_dfa_t *dfa)
				599	{
				600	int i, j;
				601
				602	if (dfa->nodes)
				603	for (i = 0; i < dfa->nodes_len; ++i)
				604	free_token (dfa->nodes + i);
				605	re_free (dfa->nexts);
				606	for (i = 0; i < dfa->nodes_len; ++i)
				607	{
				608	if (dfa->eclosures != NULL)
				609	re_node_set_free (dfa->eclosures + i);
				610	if (dfa->inveclosures != NULL)
				611	re_node_set_free (dfa->inveclosures + i);
				612	if (dfa->edests != NULL)
				613	re_node_set_free (dfa->edests + i);
				614	}
				615	re_free (dfa->edests);
				616	re_free (dfa->eclosures);
				617	re_free (dfa->inveclosures);
				618	re_free (dfa->nodes);
				619
				620	if (dfa->state_table)
				621	for (i = 0; i <= dfa->state_hash_mask; ++i)
				622	{
				623	struct re_state_table_entry *entry = dfa->state_table + i;
				624	for (j = 0; j < entry->num; ++j)
				625	{
				626	re_dfastate_t *state = entry->array[j];
				627	free_state (state);
				628	}
				629	re_free (entry->array);
				630	}
				631	re_free (dfa->state_table);
				632	#ifdef RE_ENABLE_I18N
				633	if (dfa->sb_char != utf8_sb_map)
				634	re_free (dfa->sb_char);
				635	#endif
				636	re_free (dfa->subexp_map);
				637	#ifdef DEBUG
				638	re_free (dfa->re_str);
				639	#endif
				640
				641	re_free (dfa);
				642	}
				643
				644
				645	/* Free dynamically allocated space used by PREG. */
				646
				647	void
Junio C Hamano	178b331	2010-08-19 15:31:30 -0700	[diff] [blame]	648	regfree (regex_t *preg)
Ævar Arnfjörð Bjarmason	d18f76d	2010-08-17 09:24:38 +0000	[diff] [blame]	649	{
				650	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
				651	if (BE (dfa != NULL, 1))
				652	free_dfa_content (dfa);
				653	preg->buffer = NULL;
				654	preg->allocated = 0;
				655
				656	re_free (preg->fastmap);
				657	preg->fastmap = NULL;
				658
				659	re_free (preg->translate);
				660	preg->translate = NULL;
				661	}
				662	#ifdef _LIBC
				663	weak_alias (__regfree, regfree)
				664	#endif
				665
				666	/* Entry points compatible with 4.2 BSD regex library. We don't define
				667	them unless specifically requested. */
				668
				669	#if defined _REGEX_RE_COMP \|\| defined _LIBC
				670
				671	/* BSD has one and only one pattern buffer. */
				672	static struct re_pattern_buffer re_comp_buf;
				673
				674	char *
				675	# ifdef _LIBC
				676	/* Make these definitions weak in libc, so POSIX programs can redefine
				677	these names if they don't use our functions, and still use
				678	regcomp/regexec above without link errors. */
				679	weak_function
				680	# endif
				681	re_comp (s)
				682	const char *s;
				683	{
				684	reg_errcode_t ret;
				685	char *fastmap;
				686
				687	if (!s)
				688	{
				689	if (!re_comp_buf.buffer)
				690	return gettext ("No previous regular expression");
				691	return 0;
				692	}
				693
				694	if (re_comp_buf.buffer)
				695	{
				696	fastmap = re_comp_buf.fastmap;
				697	re_comp_buf.fastmap = NULL;
				698	__regfree (&re_comp_buf);
				699	memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
				700	re_comp_buf.fastmap = fastmap;
				701	}
				702
				703	if (re_comp_buf.fastmap == NULL)
				704	{
				705	re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
				706	if (re_comp_buf.fastmap == NULL)
				707	return (char *) gettext (__re_error_msgid
				708	+ __re_error_msgid_idx[(int) REG_ESPACE]);
				709	}
				710
				711	/* Since `re_exec' always passes NULL for the `regs' argument, we
				712	don't need to initialize the pattern buffer fields which affect it. */
				713
				714	/* Match anchors at newlines. */
				715	re_comp_buf.newline_anchor = 1;
				716
				717	ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
				718
				719	if (!ret)
				720	return NULL;
				721
				722	/* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
				723	return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
				724	}
				725
				726	#ifdef _LIBC
				727	libc_freeres_fn (free_mem)
				728	{
				729	__regfree (&re_comp_buf);
				730	}
				731	#endif
				732
				733	#endif /* _REGEX_RE_COMP */
				734
				735	/* Internal entry point.
				736	Compile the regular expression PATTERN, whose length is LENGTH.
				737	SYNTAX indicate regular expression's syntax. */
				738
				739	static reg_errcode_t
				740	re_compile_internal (regex_t preg, const char pattern, size_t length,
				741	reg_syntax_t syntax)
				742	{
				743	reg_errcode_t err = REG_NOERROR;
				744	re_dfa_t *dfa;
				745	re_string_t regexp;
				746
				747	/* Initialize the pattern buffer. */
				748	preg->fastmap_accurate = 0;
				749	preg->syntax = syntax;
				750	preg->not_bol = preg->not_eol = 0;
				751	preg->used = 0;
				752	preg->re_nsub = 0;
				753	preg->can_be_null = 0;
				754	preg->regs_allocated = REGS_UNALLOCATED;
				755
				756	/* Initialize the dfa. */
				757	dfa = (re_dfa_t *) preg->buffer;
				758	if (BE (preg->allocated < sizeof (re_dfa_t), 0))
				759	{
				760	/* If zero allocated, but buffer is non-null, try to realloc
				761	enough space. This loses if buffer's address is bogus, but
				762	that is the user's responsibility. If ->buffer is NULL this
				763	is a simple allocation. */
				764	dfa = re_realloc (preg->buffer, re_dfa_t, 1);
				765	if (dfa == NULL)
				766	return REG_ESPACE;
				767	preg->allocated = sizeof (re_dfa_t);
				768	preg->buffer = (unsigned char *) dfa;
				769	}
				770	preg->used = sizeof (re_dfa_t);
				771
				772	err = init_dfa (dfa, length);
				773	if (BE (err != REG_NOERROR, 0))
				774	{
				775	free_dfa_content (dfa);
				776	preg->buffer = NULL;
				777	preg->allocated = 0;
				778	return err;
				779	}
				780	#ifdef DEBUG
				781	/* Note: length+1 will not overflow since it is checked in init_dfa. */
				782	dfa->re_str = re_malloc (char, length + 1);
				783	strncpy (dfa->re_str, pattern, length + 1);
				784	#endif
				785
				786	__libc_lock_init (dfa->lock);
				787
				788	err = re_string_construct (&regexp, pattern, length, preg->translate,
				789	syntax & RE_ICASE, dfa);
				790	if (BE (err != REG_NOERROR, 0))
				791	{
				792	re_compile_internal_free_return:
				793	free_workarea_compile (preg);
				794	re_string_destruct (&regexp);
				795	free_dfa_content (dfa);
				796	preg->buffer = NULL;
				797	preg->allocated = 0;
				798	return err;
				799	}
				800
				801	/* Parse the regular expression, and build a structure tree. */
				802	preg->re_nsub = 0;
				803	dfa->str_tree = parse (&regexp, preg, syntax, &err);
				804	if (BE (dfa->str_tree == NULL, 0))
				805	goto re_compile_internal_free_return;
				806
				807	/* Analyze the tree and create the nfa. */
				808	err = analyze (preg);
				809	if (BE (err != REG_NOERROR, 0))
				810	goto re_compile_internal_free_return;
				811
				812	#ifdef RE_ENABLE_I18N
				813	/* If possible, do searching in single byte encoding to speed things up. */
				814	if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
				815	optimize_utf8 (dfa);
				816	#endif
				817
				818	/* Then create the initial state of the dfa. */
				819	err = create_initial_state (dfa);
				820
				821	/* Release work areas. */
				822	free_workarea_compile (preg);
				823	re_string_destruct (&regexp);
				824
				825	if (BE (err != REG_NOERROR, 0))
				826	{
				827	free_dfa_content (dfa);
				828	preg->buffer = NULL;
				829	preg->allocated = 0;
				830	}
				831
				832	return err;
				833	}
				834
				835	/* Initialize DFA. We use the length of the regular expression PAT_LEN
				836	as the initial length of some arrays. */
				837
				838	static reg_errcode_t
				839	init_dfa (re_dfa_t *dfa, size_t pat_len)
				840	{
				841	unsigned int table_size;
				842	#ifndef _LIBC
				843	char *codeset_name;
				844	#endif
				845
				846	memset (dfa, '\0', sizeof (re_dfa_t));
				847
				848	/* Force allocation of str_tree_storage the first time. */
				849	dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
				850
				851	/* Avoid overflows. */
				852	if (pat_len == SIZE_MAX)
				853	return REG_ESPACE;
				854
				855	dfa->nodes_alloc = pat_len + 1;
				856	dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
				857
				858	/* table_size = 2 ^ ceil(log pat_len) */
				859	for (table_size = 1; ; table_size <<= 1)
				860	if (table_size > pat_len)
				861	break;
				862
				863	dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
				864	dfa->state_hash_mask = table_size - 1;
				865
				866	dfa->mb_cur_max = MB_CUR_MAX;
				867	#ifdef _LIBC
				868	if (dfa->mb_cur_max == 6
				869	&& strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
				870	dfa->is_utf8 = 1;
				871	dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
				872	!= 0);
				873	#else
				874	# ifdef HAVE_LANGINFO_CODESET
				875	codeset_name = nl_langinfo (CODESET);
				876	# else
				877	codeset_name = getenv ("LC_ALL");
				878	if (codeset_name == NULL \|\| codeset_name[0] == '\0')
				879	codeset_name = getenv ("LC_CTYPE");
				880	if (codeset_name == NULL \|\| codeset_name[0] == '\0')
				881	codeset_name = getenv ("LANG");
				882	if (codeset_name == NULL)
				883	codeset_name = "";
				884	else if (strchr (codeset_name, '.') != NULL)
				885	codeset_name = strchr (codeset_name, '.') + 1;
				886	# endif
				887
				888	/* strcasecmp isn't a standard interface. brute force check */
				889	#if 0
				890	if (strcasecmp (codeset_name, "UTF-8") == 0
				891	\|\| strcasecmp (codeset_name, "UTF8") == 0)
				892	dfa->is_utf8 = 1;
				893	#else
				894	if ( (codeset_name[0] == 'U' \|\| codeset_name[0] == 'u')
				895	&& (codeset_name[1] == 'T' \|\| codeset_name[1] == 't')
				896	&& (codeset_name[2] == 'F' \|\| codeset_name[2] == 'f')
				897	&& (codeset_name[3] == '-'
				898	? codeset_name[4] == '8' && codeset_name[5] == '\0'
				899	: codeset_name[3] == '8' && codeset_name[4] == '\0'))
				900	dfa->is_utf8 = 1;
				901	#endif
				902
				903	/* We check exhaustively in the loop below if this charset is a
				904	superset of ASCII. */
				905	dfa->map_notascii = 0;
				906	#endif
				907
				908	#ifdef RE_ENABLE_I18N
				909	if (dfa->mb_cur_max > 1)
				910	{
				911	if (dfa->is_utf8)
				912	{
				913	#if !defined(__GNUC__) \|\| __GNUC__ < 3
				914	static short utf8_sb_map_inited = 0;
				915
				916	if (! utf8_sb_map_inited)
				917	{
				918	int i;
				919
				920	utf8_sb_map_inited = 0;
				921	for (i = 0; i <= 0x80 / BITSET_WORD_BITS - 1; i++)
				922	utf8_sb_map[i] = BITSET_WORD_MAX;
				923	}
				924	#endif
				925	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
				926	}
				927	else
				928	{
				929	int i, j, ch;
				930
				931	dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
				932	if (BE (dfa->sb_char == NULL, 0))
				933	return REG_ESPACE;
				934
				935	/* Set the bits corresponding to single byte chars. */
				936	for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
				937	for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
				938	{
				939	wint_t wch = __btowc (ch);
				940	if (wch != WEOF)
				941	dfa->sb_char[i] \|= (bitset_word_t) 1 << j;
				942	# ifndef _LIBC
				943	if (isascii (ch) && wch != ch)
				944	dfa->map_notascii = 1;
				945	# endif
				946	}
				947	}
				948	}
				949	#endif
				950
				951	if (BE (dfa->nodes == NULL \|\| dfa->state_table == NULL, 0))
				952	return REG_ESPACE;
				953	return REG_NOERROR;
				954	}
				955
				956	/* Initialize WORD_CHAR table, which indicate which character is
				957	"word". In this case "word" means that it is the word construction
				958	character used by some operators like "\<", "\>", etc. */
				959
				960	static void
				961	internal_function
				962	init_word_char (re_dfa_t *dfa)
				963	{
				964	int i, j, ch;
				965	dfa->word_ops_used = 1;
				966	for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
				967	for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
				968	if (isalnum (ch) \|\| ch == '_')
				969	dfa->word_char[i] \|= (bitset_word_t) 1 << j;
				970	}
				971
				972	/* Free the work area which are only used while compiling. */
				973
				974	static void
				975	free_workarea_compile (regex_t *preg)
				976	{
				977	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
				978	bin_tree_storage_t storage, next;
				979	for (storage = dfa->str_tree_storage; storage; storage = next)
				980	{
				981	next = storage->next;
				982	re_free (storage);
				983	}
				984	dfa->str_tree_storage = NULL;
				985	dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
				986	dfa->str_tree = NULL;
				987	re_free (dfa->org_indices);
				988	dfa->org_indices = NULL;
				989	}
				990
				991	/* Create initial states for all contexts. */
				992
				993	static reg_errcode_t
				994	create_initial_state (re_dfa_t *dfa)
				995	{
				996	int first, i;
				997	reg_errcode_t err;
				998	re_node_set init_nodes;
				999
				1000	/* Initial states have the epsilon closure of the node which is
				1001	the first node of the regular expression. */
				1002	first = dfa->str_tree->first->node_idx;
				1003	dfa->init_node = first;
				1004	err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
				1005	if (BE (err != REG_NOERROR, 0))
				1006	return err;
				1007
				1008	/* The back-references which are in initial states can epsilon transit,
				1009	since in this case all of the subexpressions can be null.
				1010	Then we add epsilon closures of the nodes which are the next nodes of
				1011	the back-references. */
				1012	if (dfa->nbackref > 0)
				1013	for (i = 0; i < init_nodes.nelem; ++i)
				1014	{
				1015	int node_idx = init_nodes.elems[i];
				1016	re_token_type_t type = dfa->nodes[node_idx].type;
				1017
				1018	int clexp_idx;
				1019	if (type != OP_BACK_REF)
				1020	continue;
				1021	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
				1022	{
				1023	re_token_t *clexp_node;
				1024	clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
				1025	if (clexp_node->type == OP_CLOSE_SUBEXP
				1026	&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
				1027	break;
				1028	}
				1029	if (clexp_idx == init_nodes.nelem)
				1030	continue;
				1031
				1032	if (type == OP_BACK_REF)
				1033	{
				1034	int dest_idx = dfa->edests[node_idx].elems[0];
				1035	if (!re_node_set_contains (&init_nodes, dest_idx))
				1036	{
				1037	reg_errcode_t err = re_node_set_merge (&init_nodes,
				1038	dfa->eclosures
				1039	+ dest_idx);
				1040	if (err != REG_NOERROR)
				1041	return err;
				1042	i = 0;
				1043	}
				1044	}
				1045	}
				1046
				1047	/* It must be the first time to invoke acquire_state. */
				1048	dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
				1049	/* We don't check ERR here, since the initial state must not be NULL. */
				1050	if (BE (dfa->init_state == NULL, 0))
				1051	return err;
				1052	if (dfa->init_state->has_constraint)
				1053	{
				1054	dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
				1055	CONTEXT_WORD);
				1056	dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
				1057	CONTEXT_NEWLINE);
				1058	dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
				1059	&init_nodes,
				1060	CONTEXT_NEWLINE
				1061	\| CONTEXT_BEGBUF);
				1062	if (BE (dfa->init_state_word == NULL \|\| dfa->init_state_nl == NULL
				1063	\|\| dfa->init_state_begbuf == NULL, 0))
				1064	return err;
				1065	}
				1066	else
				1067	dfa->init_state_word = dfa->init_state_nl
				1068	= dfa->init_state_begbuf = dfa->init_state;
				1069
				1070	re_node_set_free (&init_nodes);
				1071	return REG_NOERROR;
				1072	}
				1073
				1074	#ifdef RE_ENABLE_I18N
				1075	/* If it is possible to do searching in single byte encoding instead of UTF-8
				1076	to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
				1077	DFA nodes where needed. */
				1078
				1079	static void
				1080	optimize_utf8 (re_dfa_t *dfa)
				1081	{
				1082	int node, i, mb_chars = 0, has_period = 0;
				1083
				1084	for (node = 0; node < dfa->nodes_len; ++node)
				1085	switch (dfa->nodes[node].type)
				1086	{
				1087	case CHARACTER:
				1088	if (dfa->nodes[node].opr.c >= 0x80)
				1089	mb_chars = 1;
				1090	break;
				1091	case ANCHOR:
				1092	switch (dfa->nodes[node].opr.ctx_type)
				1093	{
				1094	case LINE_FIRST:
				1095	case LINE_LAST:
				1096	case BUF_FIRST:
				1097	case BUF_LAST:
				1098	break;
				1099	default:
				1100	/* Word anchors etc. cannot be handled. It's okay to test
				1101	opr.ctx_type since constraints (for all DFA nodes) are
				1102	created by ORing one or more opr.ctx_type values. */
				1103	return;
				1104	}
				1105	break;
				1106	case OP_PERIOD:
				1107	has_period = 1;
				1108	break;
				1109	case OP_BACK_REF:
				1110	case OP_ALT:
				1111	case END_OF_RE:
				1112	case OP_DUP_ASTERISK:
				1113	case OP_OPEN_SUBEXP:
				1114	case OP_CLOSE_SUBEXP:
				1115	break;
				1116	case COMPLEX_BRACKET:
				1117	return;
				1118	case SIMPLE_BRACKET:
				1119	/* Just double check. The non-ASCII range starts at 0x80. */
				1120	assert (0x80 % BITSET_WORD_BITS == 0);
				1121	for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
				1122	if (dfa->nodes[node].opr.sbcset[i])
				1123	return;
				1124	break;
				1125	default:
				1126	abort ();
				1127	}
				1128
				1129	if (mb_chars \|\| has_period)
				1130	for (node = 0; node < dfa->nodes_len; ++node)
				1131	{
				1132	if (dfa->nodes[node].type == CHARACTER
				1133	&& dfa->nodes[node].opr.c >= 0x80)
				1134	dfa->nodes[node].mb_partial = 0;
				1135	else if (dfa->nodes[node].type == OP_PERIOD)
				1136	dfa->nodes[node].type = OP_UTF8_PERIOD;
				1137	}
				1138
				1139	/* The search can be in single byte locale. */
				1140	dfa->mb_cur_max = 1;
				1141	dfa->is_utf8 = 0;
				1142	dfa->has_mb_node = dfa->nbackref > 0 \|\| has_period;
				1143	}
				1144	#endif
				1145
				1146	/* Analyze the structure tree, and calculate "first", "next", "edest",
				1147	"eclosure", and "inveclosure". */
				1148
				1149	static reg_errcode_t
				1150	analyze (regex_t *preg)
				1151	{
				1152	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
				1153	reg_errcode_t ret;
				1154
				1155	/* Allocate arrays. */
				1156	dfa->nexts = re_malloc (int, dfa->nodes_alloc);
				1157	dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
				1158	dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
				1159	dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
				1160	if (BE (dfa->nexts == NULL \|\| dfa->org_indices == NULL \|\| dfa->edests == NULL
				1161	\|\| dfa->eclosures == NULL, 0))
				1162	return REG_ESPACE;
				1163
				1164	dfa->subexp_map = re_malloc (int, preg->re_nsub);
				1165	if (dfa->subexp_map != NULL)
				1166	{
				1167	int i;
				1168	for (i = 0; i < preg->re_nsub; i++)
				1169	dfa->subexp_map[i] = i;
				1170	preorder (dfa->str_tree, optimize_subexps, dfa);
				1171	for (i = 0; i < preg->re_nsub; i++)
				1172	if (dfa->subexp_map[i] != i)
				1173	break;
				1174	if (i == preg->re_nsub)
				1175	{
				1176	free (dfa->subexp_map);
				1177	dfa->subexp_map = NULL;
				1178	}
				1179	}
				1180
				1181	ret = postorder (dfa->str_tree, lower_subexps, preg);
				1182	if (BE (ret != REG_NOERROR, 0))
				1183	return ret;
				1184	ret = postorder (dfa->str_tree, calc_first, dfa);
				1185	if (BE (ret != REG_NOERROR, 0))
				1186	return ret;
				1187	preorder (dfa->str_tree, calc_next, dfa);
				1188	ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
				1189	if (BE (ret != REG_NOERROR, 0))
				1190	return ret;
				1191	ret = calc_eclosure (dfa);
				1192	if (BE (ret != REG_NOERROR, 0))
				1193	return ret;
				1194
				1195	/* We only need this during the prune_impossible_nodes pass in regexec.c;
				1196	skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
				1197	if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
				1198	\|\| dfa->nbackref)
				1199	{
				1200	dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
				1201	if (BE (dfa->inveclosures == NULL, 0))
				1202	return REG_ESPACE;
				1203	ret = calc_inveclosure (dfa);
				1204	}
				1205
				1206	return ret;
				1207	}
				1208
				1209	/* Our parse trees are very unbalanced, so we cannot use a stack to
				1210	implement parse tree visits. Instead, we use parent pointers and
				1211	some hairy code in these two functions. */
				1212	static reg_errcode_t
				1213	postorder (bin_tree_t root, reg_errcode_t (fn (void , bin_tree_t *)),
				1214	void *extra)
				1215	{
				1216	bin_tree_t node, prev;
				1217
				1218	for (node = root; ; )
				1219	{
				1220	/* Descend down the tree, preferably to the left (or to the right
				1221	if that's the only child). */
				1222	while (node->left \|\| node->right)
				1223	if (node->left)
				1224	node = node->left;
				1225	else
				1226	node = node->right;
				1227
				1228	do
				1229	{
				1230	reg_errcode_t err = fn (extra, node);
				1231	if (BE (err != REG_NOERROR, 0))
				1232	return err;
				1233	if (node->parent == NULL)
				1234	return REG_NOERROR;
				1235	prev = node;
				1236	node = node->parent;
				1237	}
				1238	/* Go up while we have a node that is reached from the right. */
				1239	while (node->right == prev \|\| node->right == NULL);
				1240	node = node->right;
				1241	}
				1242	}
				1243
				1244	static reg_errcode_t
				1245	preorder (bin_tree_t root, reg_errcode_t (fn (void , bin_tree_t *)),
				1246	void *extra)
				1247	{
				1248	bin_tree_t *node;
				1249
				1250	for (node = root; ; )
				1251	{
				1252	reg_errcode_t err = fn (extra, node);
				1253	if (BE (err != REG_NOERROR, 0))
				1254	return err;
				1255
				1256	/* Go to the left node, or up and to the right. */
				1257	if (node->left)
				1258	node = node->left;
				1259	else
				1260	{
				1261	bin_tree_t *prev = NULL;
				1262	while (node->right == prev \|\| node->right == NULL)
				1263	{
				1264	prev = node;
				1265	node = node->parent;
				1266	if (!node)
				1267	return REG_NOERROR;
				1268	}
				1269	node = node->right;
				1270	}
				1271	}
				1272	}
				1273
				1274	/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
				1275	re_search_internal to map the inner one's opr.idx to this one's. Adjust
				1276	backreferences as well. Requires a preorder visit. */
				1277	static reg_errcode_t
				1278	optimize_subexps (void extra, bin_tree_t node)
				1279	{
				1280	re_dfa_t dfa = (re_dfa_t ) extra;
				1281
				1282	if (node->token.type == OP_BACK_REF && dfa->subexp_map)
				1283	{
				1284	int idx = node->token.opr.idx;
				1285	node->token.opr.idx = dfa->subexp_map[idx];
				1286	dfa->used_bkref_map \|= 1 << node->token.opr.idx;
				1287	}
				1288
				1289	else if (node->token.type == SUBEXP
				1290	&& node->left && node->left->token.type == SUBEXP)
				1291	{
				1292	int other_idx = node->left->token.opr.idx;
				1293
				1294	node->left = node->left->left;
				1295	if (node->left)
				1296	node->left->parent = node;
				1297
				1298	dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
				1299	if (other_idx < BITSET_WORD_BITS)
				1300	dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
				1301	}
				1302
				1303	return REG_NOERROR;
				1304	}
				1305
				1306	/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
				1307	of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
				1308	static reg_errcode_t
				1309	lower_subexps (void extra, bin_tree_t node)
				1310	{
				1311	regex_t preg = (regex_t ) extra;
				1312	reg_errcode_t err = REG_NOERROR;
				1313
				1314	if (node->left && node->left->token.type == SUBEXP)
				1315	{
				1316	node->left = lower_subexp (&err, preg, node->left);
				1317	if (node->left)
				1318	node->left->parent = node;
				1319	}
				1320	if (node->right && node->right->token.type == SUBEXP)
				1321	{
				1322	node->right = lower_subexp (&err, preg, node->right);
				1323	if (node->right)
				1324	node->right->parent = node;
				1325	}
				1326
				1327	return err;
				1328	}
				1329
				1330	static bin_tree_t *
				1331	lower_subexp (reg_errcode_t err, regex_t preg, bin_tree_t *node)
				1332	{
				1333	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
				1334	bin_tree_t *body = node->left;
				1335	bin_tree_t op, cls, tree1, tree;
				1336
				1337	if (preg->no_sub
				1338	/* We do not optimize empty subexpressions, because otherwise we may
				1339	have bad CONCAT nodes with NULL children. This is obviously not
				1340	very common, so we do not lose much. An example that triggers
				1341	this case is the sed "script" //x. */
				1342	&& node->left != NULL
				1343	&& (node->token.opr.idx >= BITSET_WORD_BITS
				1344	\|\| !(dfa->used_bkref_map
				1345	& ((bitset_word_t) 1 << node->token.opr.idx))))
				1346	return node->left;
				1347
				1348	/* Convert the SUBEXP node to the concatenation of an
				1349	OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
				1350	op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
				1351	cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
				1352	tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
				1353	tree = create_tree (dfa, op, tree1, CONCAT);
				1354	if (BE (tree == NULL \|\| tree1 == NULL \|\| op == NULL \|\| cls == NULL, 0))
				1355	{
				1356	*err = REG_ESPACE;
				1357	return NULL;
				1358	}
				1359
				1360	op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
				1361	op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
				1362	return tree;
				1363	}
				1364
				1365	/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
				1366	nodes. Requires a postorder visit. */
				1367	static reg_errcode_t
				1368	calc_first (void extra, bin_tree_t node)
				1369	{
				1370	re_dfa_t dfa = (re_dfa_t ) extra;
				1371	if (node->token.type == CONCAT)
				1372	{
				1373	node->first = node->left->first;
				1374	node->node_idx = node->left->node_idx;
				1375	}
				1376	else
				1377	{
				1378	node->first = node;
				1379	node->node_idx = re_dfa_add_node (dfa, node->token);
				1380	if (BE (node->node_idx == -1, 0))
				1381	return REG_ESPACE;
				1382	if (node->token.type == ANCHOR)
				1383	dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
				1384	}
				1385	return REG_NOERROR;
				1386	}
				1387
				1388	/* Pass 2: compute NEXT on the tree. Preorder visit. */
				1389	static reg_errcode_t
				1390	calc_next (void extra, bin_tree_t node)
				1391	{
				1392	switch (node->token.type)
				1393	{
				1394	case OP_DUP_ASTERISK:
				1395	node->left->next = node;
				1396	break;
				1397	case CONCAT:
				1398	node->left->next = node->right->first;
				1399	node->right->next = node->next;
				1400	break;
				1401	default:
				1402	if (node->left)
				1403	node->left->next = node->next;
				1404	if (node->right)
				1405	node->right->next = node->next;
				1406	break;
				1407	}
				1408	return REG_NOERROR;
				1409	}
				1410
				1411	/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
				1412	static reg_errcode_t
				1413	link_nfa_nodes (void extra, bin_tree_t node)
				1414	{
				1415	re_dfa_t dfa = (re_dfa_t ) extra;
				1416	int idx = node->node_idx;
				1417	reg_errcode_t err = REG_NOERROR;
				1418
				1419	switch (node->token.type)
				1420	{
				1421	case CONCAT:
				1422	break;
				1423
				1424	case END_OF_RE:
				1425	assert (node->next == NULL);
				1426	break;
				1427
				1428	case OP_DUP_ASTERISK:
				1429	case OP_ALT:
				1430	{
				1431	int left, right;
				1432	dfa->has_plural_match = 1;
				1433	if (node->left != NULL)
				1434	left = node->left->first->node_idx;
				1435	else
				1436	left = node->next->node_idx;
				1437	if (node->right != NULL)
				1438	right = node->right->first->node_idx;
				1439	else
				1440	right = node->next->node_idx;
				1441	assert (left > -1);
				1442	assert (right > -1);
				1443	err = re_node_set_init_2 (dfa->edests + idx, left, right);
				1444	}
				1445	break;
				1446
				1447	case ANCHOR:
				1448	case OP_OPEN_SUBEXP:
				1449	case OP_CLOSE_SUBEXP:
				1450	err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
				1451	break;
				1452
				1453	case OP_BACK_REF:
				1454	dfa->nexts[idx] = node->next->node_idx;
				1455	if (node->token.type == OP_BACK_REF)
				1456	err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
				1457	break;
				1458
				1459	default:
				1460	assert (!IS_EPSILON_NODE (node->token.type));
				1461	dfa->nexts[idx] = node->next->node_idx;
				1462	break;
				1463	}
				1464
				1465	return err;
				1466	}
				1467
				1468	/* Duplicate the epsilon closure of the node ROOT_NODE.
				1469	Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
				1470	to their own constraint. */
				1471
				1472	static reg_errcode_t
				1473	internal_function
				1474	duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
				1475	int root_node, unsigned int init_constraint)
				1476	{
				1477	int org_node, clone_node, ret;
				1478	unsigned int constraint = init_constraint;
				1479	for (org_node = top_org_node, clone_node = top_clone_node;;)
				1480	{
				1481	int org_dest, clone_dest;
				1482	if (dfa->nodes[org_node].type == OP_BACK_REF)
				1483	{
				1484	/* If the back reference epsilon-transit, its destination must
				1485	also have the constraint. Then duplicate the epsilon closure
				1486	of the destination of the back reference, and store it in
				1487	edests of the back reference. */
				1488	org_dest = dfa->nexts[org_node];
				1489	re_node_set_empty (dfa->edests + clone_node);
				1490	clone_dest = duplicate_node (dfa, org_dest, constraint);
				1491	if (BE (clone_dest == -1, 0))
				1492	return REG_ESPACE;
				1493	dfa->nexts[clone_node] = dfa->nexts[org_node];
				1494	ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
				1495	if (BE (ret < 0, 0))
				1496	return REG_ESPACE;
				1497	}
				1498	else if (dfa->edests[org_node].nelem == 0)
				1499	{
				1500	/* In case of the node can't epsilon-transit, don't duplicate the
				1501	destination and store the original destination as the
				1502	destination of the node. */
				1503	dfa->nexts[clone_node] = dfa->nexts[org_node];
				1504	break;
				1505	}
				1506	else if (dfa->edests[org_node].nelem == 1)
				1507	{
				1508	/* In case of the node can epsilon-transit, and it has only one
				1509	destination. */
				1510	org_dest = dfa->edests[org_node].elems[0];
				1511	re_node_set_empty (dfa->edests + clone_node);
				1512	/* If the node is root_node itself, it means the epsilon clsoure
				1513	has a loop. Then tie it to the destination of the root_node. */
				1514	if (org_node == root_node && clone_node != org_node)
				1515	{
				1516	ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
				1517	if (BE (ret < 0, 0))
				1518	return REG_ESPACE;
				1519	break;
				1520	}
				1521	/* In case of the node has another constraint, add it. */
				1522	constraint \|= dfa->nodes[org_node].constraint;
				1523	clone_dest = duplicate_node (dfa, org_dest, constraint);
				1524	if (BE (clone_dest == -1, 0))
				1525	return REG_ESPACE;
				1526	ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
				1527	if (BE (ret < 0, 0))
				1528	return REG_ESPACE;
				1529	}
				1530	else /* dfa->edests[org_node].nelem == 2 */
				1531	{
				1532	/* In case of the node can epsilon-transit, and it has two
				1533	destinations. In the bin_tree_t and DFA, that's '\|' and ''. /
				1534	org_dest = dfa->edests[org_node].elems[0];
				1535	re_node_set_empty (dfa->edests + clone_node);
				1536	/* Search for a duplicated node which satisfies the constraint. */
				1537	clone_dest = search_duplicated_node (dfa, org_dest, constraint);
				1538	if (clone_dest == -1)
				1539	{
				1540	/* There is no such duplicated node, create a new one. */
				1541	reg_errcode_t err;
				1542	clone_dest = duplicate_node (dfa, org_dest, constraint);
				1543	if (BE (clone_dest == -1, 0))
				1544	return REG_ESPACE;
				1545	ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
				1546	if (BE (ret < 0, 0))
				1547	return REG_ESPACE;
				1548	err = duplicate_node_closure (dfa, org_dest, clone_dest,
				1549	root_node, constraint);
				1550	if (BE (err != REG_NOERROR, 0))
				1551	return err;
				1552	}
				1553	else
				1554	{
				1555	/* There is a duplicated node which satisfies the constraint,
				1556	use it to avoid infinite loop. */
				1557	ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
				1558	if (BE (ret < 0, 0))
				1559	return REG_ESPACE;
				1560	}
				1561
				1562	org_dest = dfa->edests[org_node].elems[1];
				1563	clone_dest = duplicate_node (dfa, org_dest, constraint);
				1564	if (BE (clone_dest == -1, 0))
				1565	return REG_ESPACE;
				1566	ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
				1567	if (BE (ret < 0, 0))
				1568	return REG_ESPACE;
				1569	}
				1570	org_node = org_dest;
				1571	clone_node = clone_dest;
				1572	}
				1573	return REG_NOERROR;
				1574	}
				1575
				1576	/* Search for a node which is duplicated from the node ORG_NODE, and
				1577	satisfies the constraint CONSTRAINT. */
				1578
				1579	static int
				1580	search_duplicated_node (const re_dfa_t *dfa, int org_node,
				1581	unsigned int constraint)
				1582	{
				1583	int idx;
				1584	for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
				1585	{
				1586	if (org_node == dfa->org_indices[idx]
				1587	&& constraint == dfa->nodes[idx].constraint)
				1588	return idx; /* Found. */
				1589	}
				1590	return -1; /* Not found. */
				1591	}
				1592
				1593	/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
				1594	Return the index of the new node, or -1 if insufficient storage is
				1595	available. */
				1596
				1597	static int
				1598	duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
				1599	{
				1600	int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
				1601	if (BE (dup_idx != -1, 1))
				1602	{
				1603	dfa->nodes[dup_idx].constraint = constraint;
				1604	dfa->nodes[dup_idx].constraint \|= dfa->nodes[org_idx].constraint;
				1605	dfa->nodes[dup_idx].duplicated = 1;
				1606
				1607	/* Store the index of the original node. */
				1608	dfa->org_indices[dup_idx] = org_idx;
				1609	}
				1610	return dup_idx;
				1611	}
				1612
				1613	static reg_errcode_t
				1614	calc_inveclosure (re_dfa_t *dfa)
				1615	{
				1616	int src, idx, ret;
				1617	for (idx = 0; idx < dfa->nodes_len; ++idx)
				1618	re_node_set_init_empty (dfa->inveclosures + idx);
				1619
				1620	for (src = 0; src < dfa->nodes_len; ++src)
				1621	{
				1622	int *elems = dfa->eclosures[src].elems;
				1623	for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
				1624	{
				1625	ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
				1626	if (BE (ret == -1, 0))
				1627	return REG_ESPACE;
				1628	}
				1629	}
				1630
				1631	return REG_NOERROR;
				1632	}
				1633
				1634	/* Calculate "eclosure" for all the node in DFA. */
				1635
				1636	static reg_errcode_t
				1637	calc_eclosure (re_dfa_t *dfa)
				1638	{
				1639	int node_idx, incomplete;
				1640	#ifdef DEBUG
				1641	assert (dfa->nodes_len > 0);
				1642	#endif
				1643	incomplete = 0;
				1644	/* For each nodes, calculate epsilon closure. */
				1645	for (node_idx = 0; ; ++node_idx)
				1646	{
				1647	reg_errcode_t err;
				1648	re_node_set eclosure_elem;
				1649	if (node_idx == dfa->nodes_len)
				1650	{
				1651	if (!incomplete)
				1652	break;
				1653	incomplete = 0;
				1654	node_idx = 0;
				1655	}
				1656
				1657	#ifdef DEBUG
				1658	assert (dfa->eclosures[node_idx].nelem != -1);
				1659	#endif
				1660
				1661	/* If we have already calculated, skip it. */
				1662	if (dfa->eclosures[node_idx].nelem != 0)
				1663	continue;
				1664	/* Calculate epsilon closure of `node_idx'. */
				1665	err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
				1666	if (BE (err != REG_NOERROR, 0))
				1667	return err;
				1668
				1669	if (dfa->eclosures[node_idx].nelem == 0)
				1670	{
				1671	incomplete = 1;
				1672	re_node_set_free (&eclosure_elem);
				1673	}
				1674	}
				1675	return REG_NOERROR;
				1676	}
				1677
				1678	/* Calculate epsilon closure of NODE. */
				1679
				1680	static reg_errcode_t
				1681	calc_eclosure_iter (re_node_set new_set, re_dfa_t dfa, int node, int root)
				1682	{
				1683	reg_errcode_t err;
				1684	int i;
				1685	re_node_set eclosure;
				1686	int ret;
				1687	int incomplete = 0;
				1688	err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
				1689	if (BE (err != REG_NOERROR, 0))
				1690	return err;
				1691
				1692	/* This indicates that we are calculating this node now.
				1693	We reference this value to avoid infinite loop. */
				1694	dfa->eclosures[node].nelem = -1;
				1695
				1696	/* If the current node has constraints, duplicate all nodes
				1697	since they must inherit the constraints. */
				1698	if (dfa->nodes[node].constraint
				1699	&& dfa->edests[node].nelem
				1700	&& !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
				1701	{
				1702	err = duplicate_node_closure (dfa, node, node, node,
				1703	dfa->nodes[node].constraint);
				1704	if (BE (err != REG_NOERROR, 0))
				1705	return err;
				1706	}
				1707
				1708	/* Expand each epsilon destination nodes. */
				1709	if (IS_EPSILON_NODE(dfa->nodes[node].type))
				1710	for (i = 0; i < dfa->edests[node].nelem; ++i)
				1711	{
				1712	re_node_set eclosure_elem;
				1713	int edest = dfa->edests[node].elems[i];
				1714	/* If calculating the epsilon closure of `edest' is in progress,
				1715	return intermediate result. */
				1716	if (dfa->eclosures[edest].nelem == -1)
				1717	{
				1718	incomplete = 1;
				1719	continue;
				1720	}
				1721	/* If we haven't calculated the epsilon closure of `edest' yet,
				1722	calculate now. Otherwise use calculated epsilon closure. */
				1723	if (dfa->eclosures[edest].nelem == 0)
				1724	{
				1725	err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
				1726	if (BE (err != REG_NOERROR, 0))
				1727	return err;
				1728	}
				1729	else
				1730	eclosure_elem = dfa->eclosures[edest];
				1731	/* Merge the epsilon closure of `edest'. */
				1732	err = re_node_set_merge (&eclosure, &eclosure_elem);
				1733	if (BE (err != REG_NOERROR, 0))
				1734	return err;
				1735	/* If the epsilon closure of `edest' is incomplete,
				1736	the epsilon closure of this node is also incomplete. */
				1737	if (dfa->eclosures[edest].nelem == 0)
				1738	{
				1739	incomplete = 1;
				1740	re_node_set_free (&eclosure_elem);
				1741	}
				1742	}
				1743
				1744	/* An epsilon closure includes itself. */
				1745	ret = re_node_set_insert (&eclosure, node);
				1746	if (BE (ret < 0, 0))
				1747	return REG_ESPACE;
				1748	if (incomplete && !root)
				1749	dfa->eclosures[node].nelem = 0;
				1750	else
				1751	dfa->eclosures[node] = eclosure;
				1752	*new_set = eclosure;
				1753	return REG_NOERROR;
				1754	}
				1755
				1756	/* Functions for token which are used in the parser. */
				1757
				1758	/* Fetch a token from INPUT.
				1759	We must not use this function inside bracket expressions. */
				1760
				1761	static void
				1762	internal_function
				1763	fetch_token (re_token_t result, re_string_t input, reg_syntax_t syntax)
				1764	{
				1765	re_string_skip_bytes (input, peek_token (result, input, syntax));
				1766	}
				1767
				1768	/* Peek a token from INPUT, and return the length of the token.
				1769	We must not use this function inside bracket expressions. */
				1770
				1771	static int
				1772	internal_function
				1773	peek_token (re_token_t token, re_string_t input, reg_syntax_t syntax)
				1774	{
				1775	unsigned char c;
				1776
				1777	if (re_string_eoi (input))
				1778	{
				1779	token->type = END_OF_RE;
				1780	return 0;
				1781	}
				1782
				1783	c = re_string_peek_byte (input, 0);
				1784	token->opr.c = c;
				1785
				1786	token->word_char = 0;
				1787	#ifdef RE_ENABLE_I18N
				1788	token->mb_partial = 0;
				1789	if (input->mb_cur_max > 1 &&
				1790	!re_string_first_byte (input, re_string_cur_idx (input)))
				1791	{
				1792	token->type = CHARACTER;
				1793	token->mb_partial = 1;
				1794	return 1;
				1795	}
				1796	#endif
				1797	if (c == '\\')
				1798	{
				1799	unsigned char c2;
				1800	if (re_string_cur_idx (input) + 1 >= re_string_length (input))
				1801	{
				1802	token->type = BACK_SLASH;
				1803	return 1;
				1804	}
				1805
				1806	c2 = re_string_peek_byte_case (input, 1);
				1807	token->opr.c = c2;
				1808	token->type = CHARACTER;
				1809	#ifdef RE_ENABLE_I18N
				1810	if (input->mb_cur_max > 1)
				1811	{
				1812	wint_t wc = re_string_wchar_at (input,
				1813	re_string_cur_idx (input) + 1);
				1814	token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
				1815	}
				1816	else
				1817	#endif
				1818	token->word_char = IS_WORD_CHAR (c2) != 0;
				1819
				1820	switch (c2)
				1821	{
				1822	case '\|':
				1823	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
				1824	token->type = OP_ALT;
				1825	break;
				1826	case '1': case '2': case '3': case '4': case '5':
				1827	case '6': case '7': case '8': case '9':
				1828	if (!(syntax & RE_NO_BK_REFS))
				1829	{
				1830	token->type = OP_BACK_REF;
				1831	token->opr.idx = c2 - '1';
				1832	}
				1833	break;
				1834	case '<':
				1835	if (!(syntax & RE_NO_GNU_OPS))
				1836	{
				1837	token->type = ANCHOR;
				1838	token->opr.ctx_type = WORD_FIRST;
				1839	}
				1840	break;
				1841	case '>':
				1842	if (!(syntax & RE_NO_GNU_OPS))
				1843	{
				1844	token->type = ANCHOR;
				1845	token->opr.ctx_type = WORD_LAST;
				1846	}
				1847	break;
				1848	case 'b':
				1849	if (!(syntax & RE_NO_GNU_OPS))
				1850	{
				1851	token->type = ANCHOR;
				1852	token->opr.ctx_type = WORD_DELIM;
				1853	}
				1854	break;
				1855	case 'B':
				1856	if (!(syntax & RE_NO_GNU_OPS))
				1857	{
				1858	token->type = ANCHOR;
				1859	token->opr.ctx_type = NOT_WORD_DELIM;
				1860	}
				1861	break;
				1862	case 'w':
				1863	if (!(syntax & RE_NO_GNU_OPS))
				1864	token->type = OP_WORD;
				1865	break;
				1866	case 'W':
				1867	if (!(syntax & RE_NO_GNU_OPS))
				1868	token->type = OP_NOTWORD;
				1869	break;
				1870	case 's':
				1871	if (!(syntax & RE_NO_GNU_OPS))
				1872	token->type = OP_SPACE;
				1873	break;
				1874	case 'S':
				1875	if (!(syntax & RE_NO_GNU_OPS))
				1876	token->type = OP_NOTSPACE;
				1877	break;
				1878	case '`':
				1879	if (!(syntax & RE_NO_GNU_OPS))
				1880	{
				1881	token->type = ANCHOR;
				1882	token->opr.ctx_type = BUF_FIRST;
				1883	}
				1884	break;
				1885	case '\'':
				1886	if (!(syntax & RE_NO_GNU_OPS))
				1887	{
				1888	token->type = ANCHOR;
				1889	token->opr.ctx_type = BUF_LAST;
				1890	}
				1891	break;
				1892	case '(':
				1893	if (!(syntax & RE_NO_BK_PARENS))
				1894	token->type = OP_OPEN_SUBEXP;
				1895	break;
				1896	case ')':
				1897	if (!(syntax & RE_NO_BK_PARENS))
				1898	token->type = OP_CLOSE_SUBEXP;
				1899	break;
				1900	case '+':
				1901	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
				1902	token->type = OP_DUP_PLUS;
				1903	break;
				1904	case '?':
				1905	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
				1906	token->type = OP_DUP_QUESTION;
				1907	break;
				1908	case '{':
				1909	if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
				1910	token->type = OP_OPEN_DUP_NUM;
				1911	break;
				1912	case '}':
				1913	if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
				1914	token->type = OP_CLOSE_DUP_NUM;
				1915	break;
				1916	default:
				1917	break;
				1918	}
				1919	return 2;
				1920	}
				1921
				1922	token->type = CHARACTER;
				1923	#ifdef RE_ENABLE_I18N
				1924	if (input->mb_cur_max > 1)
				1925	{
				1926	wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
				1927	token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
				1928	}
				1929	else
				1930	#endif
				1931	token->word_char = IS_WORD_CHAR (token->opr.c);
				1932
				1933	switch (c)
				1934	{
				1935	case '\n':
				1936	if (syntax & RE_NEWLINE_ALT)
				1937	token->type = OP_ALT;
				1938	break;
				1939	case '\|':
				1940	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
				1941	token->type = OP_ALT;
				1942	break;
				1943	case '*':
				1944	token->type = OP_DUP_ASTERISK;
				1945	break;
				1946	case '+':
				1947	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
				1948	token->type = OP_DUP_PLUS;
				1949	break;
				1950	case '?':
				1951	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
				1952	token->type = OP_DUP_QUESTION;
				1953	break;
				1954	case '{':
				1955	if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
				1956	token->type = OP_OPEN_DUP_NUM;
				1957	break;
				1958	case '}':
				1959	if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
				1960	token->type = OP_CLOSE_DUP_NUM;
				1961	break;
				1962	case '(':
				1963	if (syntax & RE_NO_BK_PARENS)
				1964	token->type = OP_OPEN_SUBEXP;
				1965	break;
				1966	case ')':
				1967	if (syntax & RE_NO_BK_PARENS)
				1968	token->type = OP_CLOSE_SUBEXP;
				1969	break;
				1970	case '[':
				1971	token->type = OP_OPEN_BRACKET;
				1972	break;
				1973	case '.':
				1974	token->type = OP_PERIOD;
				1975	break;
				1976	case '^':
				1977	if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS \| RE_CARET_ANCHORS_HERE)) &&
				1978	re_string_cur_idx (input) != 0)
				1979	{
				1980	char prev = re_string_peek_byte (input, -1);
				1981	if (!(syntax & RE_NEWLINE_ALT) \|\| prev != '\n')
				1982	break;
				1983	}
				1984	token->type = ANCHOR;
				1985	token->opr.ctx_type = LINE_FIRST;
				1986	break;
				1987	case '$':
				1988	if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
				1989	re_string_cur_idx (input) + 1 != re_string_length (input))
				1990	{
				1991	re_token_t next;
				1992	re_string_skip_bytes (input, 1);
				1993	peek_token (&next, input, syntax);
				1994	re_string_skip_bytes (input, -1);
				1995	if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
				1996	break;
				1997	}
				1998	token->type = ANCHOR;
				1999	token->opr.ctx_type = LINE_LAST;
				2000	break;
				2001	default:
				2002	break;
				2003	}
				2004	return 1;
				2005	}
				2006
				2007	/* Peek a token from INPUT, and return the length of the token.
				2008	We must not use this function out of bracket expressions. */
				2009
				2010	static int
				2011	internal_function
				2012	peek_token_bracket (re_token_t token, re_string_t input, reg_syntax_t syntax)
				2013	{
				2014	unsigned char c;
				2015	if (re_string_eoi (input))
				2016	{
				2017	token->type = END_OF_RE;
				2018	return 0;
				2019	}
				2020	c = re_string_peek_byte (input, 0);
				2021	token->opr.c = c;
				2022
				2023	#ifdef RE_ENABLE_I18N
				2024	if (input->mb_cur_max > 1 &&
				2025	!re_string_first_byte (input, re_string_cur_idx (input)))
				2026	{
				2027	token->type = CHARACTER;
				2028	return 1;
				2029	}
				2030	#endif /* RE_ENABLE_I18N */
				2031
				2032	if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
				2033	&& re_string_cur_idx (input) + 1 < re_string_length (input))
				2034	{
				2035	/* In this case, '\' escape a character. */
				2036	unsigned char c2;
				2037	re_string_skip_bytes (input, 1);
				2038	c2 = re_string_peek_byte (input, 0);
				2039	token->opr.c = c2;
				2040	token->type = CHARACTER;
				2041	return 1;
				2042	}
				2043	if (c == '[') /* '[' is a special char in a bracket exps. */
				2044	{
				2045	unsigned char c2;
				2046	int token_len;
				2047	if (re_string_cur_idx (input) + 1 < re_string_length (input))
				2048	c2 = re_string_peek_byte (input, 1);
				2049	else
				2050	c2 = 0;
				2051	token->opr.c = c2;
				2052	token_len = 2;
				2053	switch (c2)
				2054	{
				2055	case '.':
				2056	token->type = OP_OPEN_COLL_ELEM;
				2057	break;
				2058	case '=':
				2059	token->type = OP_OPEN_EQUIV_CLASS;
				2060	break;
				2061	case ':':
				2062	if (syntax & RE_CHAR_CLASSES)
				2063	{
				2064	token->type = OP_OPEN_CHAR_CLASS;
				2065	break;
				2066	}
				2067	/* else fall through. */
				2068	default:
				2069	token->type = CHARACTER;
				2070	token->opr.c = c;
				2071	token_len = 1;
				2072	break;
				2073	}
				2074	return token_len;
				2075	}
				2076	switch (c)
				2077	{
				2078	case '-':
				2079	token->type = OP_CHARSET_RANGE;
				2080	break;
				2081	case ']':
				2082	token->type = OP_CLOSE_BRACKET;
				2083	break;
				2084	case '^':
				2085	token->type = OP_NON_MATCH_LIST;
				2086	break;
				2087	default:
				2088	token->type = CHARACTER;
				2089	}
				2090	return 1;
				2091	}
				2092
				2093	/* Functions for parser. */
				2094
				2095	/* Entry point of the parser.
				2096	Parse the regular expression REGEXP and return the structure tree.
Stefano Lattarini	ce9171c	2013-04-12 00:36:10 +0200	[diff] [blame]	2097	If an error has occurred, ERR is set by error code, and return NULL.
Ævar Arnfjörð Bjarmason	d18f76d	2010-08-17 09:24:38 +0000	[diff] [blame]	2098	This function build the following tree, from regular expression <reg_exp>:
				2099	CAT
				2100	/ \
				2101	/ \
				2102	<reg_exp> EOR
				2103
				2104	CAT means concatenation.
				2105	EOR means end of regular expression. */
				2106
				2107	static bin_tree_t *
				2108	parse (re_string_t regexp, regex_t preg, reg_syntax_t syntax,
				2109	reg_errcode_t *err)
				2110	{
				2111	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
				2112	bin_tree_t tree, eor, *root;
				2113	re_token_t current_token;
				2114	dfa->syntax = syntax;
				2115	fetch_token (&current_token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
				2116	tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
				2117	if (BE (*err != REG_NOERROR && tree == NULL, 0))
				2118	return NULL;
				2119	eor = create_tree (dfa, NULL, NULL, END_OF_RE);
				2120	if (tree != NULL)
				2121	root = create_tree (dfa, tree, eor, CONCAT);
				2122	else
				2123	root = eor;
				2124	if (BE (eor == NULL \|\| root == NULL, 0))
				2125	{
				2126	*err = REG_ESPACE;
				2127	return NULL;
				2128	}
				2129	return root;
				2130	}
				2131
				2132	/* This function build the following tree, from regular expression
				2133	<branch1>\|<branch2>:
				2134	ALT
				2135	/ \
				2136	/ \
				2137	<branch1> <branch2>
				2138
				2139	ALT means alternative, which represents the operator `\|'. */
				2140
				2141	static bin_tree_t *
				2142	parse_reg_exp (re_string_t regexp, regex_t preg, re_token_t *token,
				2143	reg_syntax_t syntax, int nest, reg_errcode_t *err)
				2144	{
				2145	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
				2146	bin_tree_t tree, branch = NULL;
				2147	tree = parse_branch (regexp, preg, token, syntax, nest, err);
				2148	if (BE (*err != REG_NOERROR && tree == NULL, 0))
				2149	return NULL;
				2150
				2151	while (token->type == OP_ALT)
				2152	{
				2153	fetch_token (token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
				2154	if (token->type != OP_ALT && token->type != END_OF_RE
				2155	&& (nest == 0 \|\| token->type != OP_CLOSE_SUBEXP))
				2156	{
				2157	branch = parse_branch (regexp, preg, token, syntax, nest, err);
				2158	if (BE (*err != REG_NOERROR && branch == NULL, 0))
				2159	return NULL;
				2160	}
				2161	else
				2162	branch = NULL;
				2163	tree = create_tree (dfa, tree, branch, OP_ALT);
				2164	if (BE (tree == NULL, 0))
				2165	{
				2166	*err = REG_ESPACE;
				2167	return NULL;
				2168	}
				2169	}
				2170	return tree;
				2171	}
				2172
				2173	/* This function build the following tree, from regular expression
				2174	<exp1><exp2>:
				2175	CAT
				2176	/ \
				2177	/ \
				2178	<exp1> <exp2>
				2179
				2180	CAT means concatenation. */
				2181
				2182	static bin_tree_t *
				2183	parse_branch (re_string_t regexp, regex_t preg, re_token_t *token,
				2184	reg_syntax_t syntax, int nest, reg_errcode_t *err)
				2185	{
				2186	bin_tree_t tree, exp;
				2187	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
				2188	tree = parse_expression (regexp, preg, token, syntax, nest, err);
				2189	if (BE (*err != REG_NOERROR && tree == NULL, 0))
				2190	return NULL;
				2191
				2192	while (token->type != OP_ALT && token->type != END_OF_RE
				2193	&& (nest == 0 \|\| token->type != OP_CLOSE_SUBEXP))
				2194	{
				2195	exp = parse_expression (regexp, preg, token, syntax, nest, err);
				2196	if (BE (*err != REG_NOERROR && exp == NULL, 0))
				2197	{
				2198	return NULL;
				2199	}
				2200	if (tree != NULL && exp != NULL)
				2201	{
				2202	tree = create_tree (dfa, tree, exp, CONCAT);
				2203	if (tree == NULL)
				2204	{
				2205	*err = REG_ESPACE;
				2206	return NULL;
				2207	}
				2208	}
				2209	else if (tree == NULL)
				2210	tree = exp;
				2211	/* Otherwise exp == NULL, we don't need to create new tree. */
				2212	}
				2213	return tree;
				2214	}
				2215
				2216	/* This function build the following tree, from regular expression a*:
				2217	*
				2218	\|
				2219	a
				2220	*/
				2221
				2222	static bin_tree_t *
				2223	parse_expression (re_string_t regexp, regex_t preg, re_token_t *token,
				2224	reg_syntax_t syntax, int nest, reg_errcode_t *err)
				2225	{
				2226	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
				2227	bin_tree_t *tree;
				2228	switch (token->type)
				2229	{
				2230	case CHARACTER:
				2231	tree = create_token_tree (dfa, NULL, NULL, token);
				2232	if (BE (tree == NULL, 0))
				2233	{
				2234	*err = REG_ESPACE;
				2235	return NULL;
				2236	}
				2237	#ifdef RE_ENABLE_I18N
				2238	if (dfa->mb_cur_max > 1)
				2239	{
				2240	while (!re_string_eoi (regexp)
				2241	&& !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
				2242	{
				2243	bin_tree_t *mbc_remain;
				2244	fetch_token (token, regexp, syntax);
				2245	mbc_remain = create_token_tree (dfa, NULL, NULL, token);
				2246	tree = create_tree (dfa, tree, mbc_remain, CONCAT);
				2247	if (BE (mbc_remain == NULL \|\| tree == NULL, 0))
				2248	{
				2249	*err = REG_ESPACE;
				2250	return NULL;
				2251	}
				2252	}
				2253	}
				2254	#endif
				2255	break;
				2256	case OP_OPEN_SUBEXP:
				2257	tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
				2258	if (BE (*err != REG_NOERROR && tree == NULL, 0))
				2259	return NULL;
				2260	break;
				2261	case OP_OPEN_BRACKET:
				2262	tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
				2263	if (BE (*err != REG_NOERROR && tree == NULL, 0))
				2264	return NULL;
				2265	break;
				2266	case OP_BACK_REF:
				2267	if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
				2268	{
				2269	*err = REG_ESUBREG;
				2270	return NULL;
				2271	}
				2272	dfa->used_bkref_map \|= 1 << token->opr.idx;
				2273	tree = create_token_tree (dfa, NULL, NULL, token);
				2274	if (BE (tree == NULL, 0))
				2275	{
				2276	*err = REG_ESPACE;
				2277	return NULL;
				2278	}
				2279	++dfa->nbackref;
				2280	dfa->has_mb_node = 1;
				2281	break;
				2282	case OP_OPEN_DUP_NUM:
				2283	if (syntax & RE_CONTEXT_INVALID_DUP)
				2284	{
				2285	*err = REG_BADRPT;
				2286	return NULL;
				2287	}
				2288	/* FALLTHROUGH */
				2289	case OP_DUP_ASTERISK:
				2290	case OP_DUP_PLUS:
				2291	case OP_DUP_QUESTION:
				2292	if (syntax & RE_CONTEXT_INVALID_OPS)
				2293	{
				2294	*err = REG_BADRPT;
				2295	return NULL;
				2296	}
				2297	else if (syntax & RE_CONTEXT_INDEP_OPS)
				2298	{
				2299	fetch_token (token, regexp, syntax);
				2300	return parse_expression (regexp, preg, token, syntax, nest, err);
				2301	}
				2302	/* else fall through */
				2303	case OP_CLOSE_SUBEXP:
				2304	if ((token->type == OP_CLOSE_SUBEXP) &&
				2305	!(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
				2306	{
				2307	*err = REG_ERPAREN;
				2308	return NULL;
				2309	}
				2310	/* else fall through */
				2311	case OP_CLOSE_DUP_NUM:
				2312	/* We treat it as a normal character. */
				2313
				2314	/* Then we can these characters as normal characters. */
				2315	token->type = CHARACTER;
				2316	/* mb_partial and word_char bits should be initialized already
				2317	by peek_token. */
				2318	tree = create_token_tree (dfa, NULL, NULL, token);
				2319	if (BE (tree == NULL, 0))
				2320	{
				2321	*err = REG_ESPACE;
				2322	return NULL;
				2323	}
				2324	break;
				2325	case ANCHOR:
				2326	if ((token->opr.ctx_type
				2327	& (WORD_DELIM \| NOT_WORD_DELIM \| WORD_FIRST \| WORD_LAST))
				2328	&& dfa->word_ops_used == 0)
				2329	init_word_char (dfa);
				2330	if (token->opr.ctx_type == WORD_DELIM
				2331	\|\| token->opr.ctx_type == NOT_WORD_DELIM)
				2332	{
				2333	bin_tree_t tree_first, tree_last;
				2334	if (token->opr.ctx_type == WORD_DELIM)
				2335	{
				2336	token->opr.ctx_type = WORD_FIRST;
				2337	tree_first = create_token_tree (dfa, NULL, NULL, token);
				2338	token->opr.ctx_type = WORD_LAST;
				2339	}
				2340	else
				2341	{
				2342	token->opr.ctx_type = INSIDE_WORD;
				2343	tree_first = create_token_tree (dfa, NULL, NULL, token);
				2344	token->opr.ctx_type = INSIDE_NOTWORD;
				2345	}
				2346	tree_last = create_token_tree (dfa, NULL, NULL, token);
				2347	tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
				2348	if (BE (tree_first == NULL \|\| tree_last == NULL \|\| tree == NULL, 0))
				2349	{
				2350	*err = REG_ESPACE;
				2351	return NULL;
				2352	}
				2353	}
				2354	else
				2355	{
				2356	tree = create_token_tree (dfa, NULL, NULL, token);
				2357	if (BE (tree == NULL, 0))
				2358	{
				2359	*err = REG_ESPACE;
				2360	return NULL;
				2361	}
				2362	}
				2363	/* We must return here, since ANCHORs can't be followed
				2364	by repetition operators.
				2365	eg. RE"^" is invalid or "<ANCHOR(^)><CHAR()>",
				2366	it must not be "<ANCHOR(^)><REPEAT()>". /
				2367	fetch_token (token, regexp, syntax);
				2368	return tree;
				2369	case OP_PERIOD:
				2370	tree = create_token_tree (dfa, NULL, NULL, token);
				2371	if (BE (tree == NULL, 0))
				2372	{
				2373	*err = REG_ESPACE;
				2374	return NULL;
				2375	}
				2376	if (dfa->mb_cur_max > 1)
				2377	dfa->has_mb_node = 1;
				2378	break;
				2379	case OP_WORD:
				2380	case OP_NOTWORD:
				2381	tree = build_charclass_op (dfa, regexp->trans,
				2382	"alnum",
				2383	"_",
				2384	token->type == OP_NOTWORD, err);
				2385	if (BE (*err != REG_NOERROR && tree == NULL, 0))
				2386	return NULL;
				2387	break;
				2388	case OP_SPACE:
				2389	case OP_NOTSPACE:
				2390	tree = build_charclass_op (dfa, regexp->trans,
				2391	"space",
				2392	"",
				2393	token->type == OP_NOTSPACE, err);
				2394	if (BE (*err != REG_NOERROR && tree == NULL, 0))
				2395	return NULL;
				2396	break;
				2397	case OP_ALT:
				2398	case END_OF_RE:
				2399	return NULL;
				2400	case BACK_SLASH:
				2401	*err = REG_EESCAPE;
				2402	return NULL;
				2403	default:
				2404	/* Must not happen? */
				2405	#ifdef DEBUG
				2406	assert (0);
				2407	#endif
				2408	return NULL;
				2409	}
				2410	fetch_token (token, regexp, syntax);
				2411
				2412	while (token->type == OP_DUP_ASTERISK \|\| token->type == OP_DUP_PLUS
				2413	\|\| token->type == OP_DUP_QUESTION \|\| token->type == OP_OPEN_DUP_NUM)
				2414	{
				2415	tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
				2416	if (BE (*err != REG_NOERROR && tree == NULL, 0))
				2417	return NULL;
				2418	/* In BRE consecutive duplications are not allowed. */
				2419	if ((syntax & RE_CONTEXT_INVALID_DUP)
				2420	&& (token->type == OP_DUP_ASTERISK
				2421	\|\| token->type == OP_OPEN_DUP_NUM))
				2422	{
				2423	*err = REG_BADRPT;
				2424	return NULL;
				2425	}
				2426	}
				2427
				2428	return tree;
				2429	}
				2430
				2431	/* This function build the following tree, from regular expression
				2432	(<reg_exp>):
				2433	SUBEXP
				2434	\|
				2435	<reg_exp>
				2436	*/
				2437
				2438	static bin_tree_t *
				2439	parse_sub_exp (re_string_t regexp, regex_t preg, re_token_t *token,
				2440	reg_syntax_t syntax, int nest, reg_errcode_t *err)
				2441	{
				2442	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
				2443	bin_tree_t *tree;
				2444	size_t cur_nsub;
				2445	cur_nsub = preg->re_nsub++;
				2446
				2447	fetch_token (token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
				2448
				2449	/* The subexpression may be a null string. */
				2450	if (token->type == OP_CLOSE_SUBEXP)
				2451	tree = NULL;
				2452	else
				2453	{
				2454	tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
				2455	if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
				2456	*err = REG_EPAREN;
				2457	if (BE (*err != REG_NOERROR, 0))
				2458	return NULL;
				2459	}
				2460
				2461	if (cur_nsub <= '9' - '1')
				2462	dfa->completed_bkref_map \|= 1 << cur_nsub;
				2463
				2464	tree = create_tree (dfa, tree, NULL, SUBEXP);
				2465	if (BE (tree == NULL, 0))
				2466	{
				2467	*err = REG_ESPACE;
				2468	return NULL;
				2469	}
				2470	tree->token.opr.idx = cur_nsub;
				2471	return tree;
				2472	}
				2473
				2474	/* This function parse repetition operators like "", "+", "{1,3}" etc. /
				2475
				2476	static bin_tree_t *
				2477	parse_dup_op (bin_tree_t elem, re_string_t regexp, re_dfa_t *dfa,
				2478	re_token_t token, reg_syntax_t syntax, reg_errcode_t err)
				2479	{
				2480	bin_tree_t tree = NULL, old_tree = NULL;
				2481	int i, start, end, start_idx = re_string_cur_idx (regexp);
				2482	#ifndef RE_TOKEN_INIT_BUG
				2483	re_token_t start_token = *token;
				2484	#else
				2485	re_token_t start_token;
				2486
				2487	memcpy ((void ) &start_token, (void ) token, sizeof start_token);
				2488	#endif
				2489
				2490	if (token->type == OP_OPEN_DUP_NUM)
				2491	{
				2492	end = 0;
				2493	start = fetch_number (regexp, token, syntax);
				2494	if (start == -1)
				2495	{
				2496	if (token->type == CHARACTER && token->opr.c == ',')
				2497	start = 0; /* We treat "{,m}" as "{0,m}". */
				2498	else
				2499	{
				2500	err = REG_BADBR; / <re>{} is invalid. */
				2501	return NULL;
				2502	}
				2503	}
				2504	if (BE (start != -2, 1))
				2505	{
				2506	/* We treat "{n}" as "{n,n}". */
				2507	end = ((token->type == OP_CLOSE_DUP_NUM) ? start
				2508	: ((token->type == CHARACTER && token->opr.c == ',')
				2509	? fetch_number (regexp, token, syntax) : -2));
				2510	}
				2511	if (BE (start == -2 \|\| end == -2, 0))
				2512	{
				2513	/* Invalid sequence. */
				2514	if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
				2515	{
				2516	if (token->type == END_OF_RE)
				2517	*err = REG_EBRACE;
				2518	else
				2519	*err = REG_BADBR;
				2520
				2521	return NULL;
				2522	}
				2523
				2524	/* If the syntax bit is set, rollback. */
				2525	re_string_set_index (regexp, start_idx);
				2526	*token = start_token;
				2527	token->type = CHARACTER;
				2528	/* mb_partial and word_char bits should be already initialized by
				2529	peek_token. */
				2530	return elem;
				2531	}
				2532
				2533	if (BE ((end != -1 && start > end) \|\| token->type != OP_CLOSE_DUP_NUM, 0))
				2534	{
				2535	/* First number greater than second. */
				2536	*err = REG_BADBR;
				2537	return NULL;
				2538	}
				2539	}
				2540	else
				2541	{
				2542	start = (token->type == OP_DUP_PLUS) ? 1 : 0;
				2543	end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
				2544	}
				2545
				2546	fetch_token (token, regexp, syntax);
				2547
				2548	if (BE (elem == NULL, 0))
				2549	return NULL;
				2550	if (BE (start == 0 && end == 0, 0))
				2551	{
				2552	postorder (elem, free_tree, NULL);
				2553	return NULL;
				2554	}
				2555
				2556	/* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
				2557	if (BE (start > 0, 0))
				2558	{
				2559	tree = elem;
				2560	for (i = 2; i <= start; ++i)
				2561	{
				2562	elem = duplicate_tree (elem, dfa);
				2563	tree = create_tree (dfa, tree, elem, CONCAT);
				2564	if (BE (elem == NULL \|\| tree == NULL, 0))
				2565	goto parse_dup_op_espace;
				2566	}
				2567
				2568	if (start == end)
				2569	return tree;
				2570
				2571	/* Duplicate ELEM before it is marked optional. */
				2572	elem = duplicate_tree (elem, dfa);
				2573	old_tree = tree;
				2574	}
				2575	else
				2576	old_tree = NULL;
				2577
				2578	if (elem->token.type == SUBEXP)
Johannes Schindelin	56a1a3a	2015-10-26 14:15:25 +0100	[diff] [blame]	2579	postorder (elem, mark_opt_subexp, (void *) (intptr_t) elem->token.opr.idx);
Ævar Arnfjörð Bjarmason	d18f76d	2010-08-17 09:24:38 +0000	[diff] [blame]	2580
				2581	tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
				2582	if (BE (tree == NULL, 0))
				2583	goto parse_dup_op_espace;
				2584
				2585	/* This loop is actually executed only when end != -1,
				2586	to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
				2587	already created the start+1-th copy. */
				2588	for (i = start + 2; i <= end; ++i)
				2589	{
				2590	elem = duplicate_tree (elem, dfa);
				2591	tree = create_tree (dfa, tree, elem, CONCAT);
				2592	if (BE (elem == NULL \|\| tree == NULL, 0))
				2593	goto parse_dup_op_espace;
				2594
				2595	tree = create_tree (dfa, tree, NULL, OP_ALT);
				2596	if (BE (tree == NULL, 0))
				2597	goto parse_dup_op_espace;
				2598	}
				2599
				2600	if (old_tree)
				2601	tree = create_tree (dfa, old_tree, tree, CONCAT);
				2602
				2603	return tree;
				2604
				2605	parse_dup_op_espace:
				2606	*err = REG_ESPACE;
				2607	return NULL;
				2608	}
				2609
				2610	/* Size of the names for collating symbol/equivalence_class/character_class.
				2611	I'm not sure, but maybe enough. */
				2612	#define BRACKET_NAME_BUF_SIZE 32
				2613
				2614	#ifndef _LIBC
				2615	/* Local function for parse_bracket_exp only used in case of NOT _LIBC.
				2616	Build the range expression which starts from START_ELEM, and ends
				2617	at END_ELEM. The result are written to MBCSET and SBCSET.
				2618	RANGE_ALLOC is the allocated size of mbcset->range_starts, and
Ondřej Bílka	749f763	2013-07-22 23:02:23 +0200	[diff] [blame]	2619	mbcset->range_ends, is a pointer argument since we may
Ævar Arnfjörð Bjarmason	d18f76d	2010-08-17 09:24:38 +0000	[diff] [blame]	2620	update it. */
				2621
				2622	static reg_errcode_t
				2623	internal_function
				2624	# ifdef RE_ENABLE_I18N
				2625	build_range_exp (bitset_t sbcset, re_charset_t mbcset, int range_alloc,
				2626	bracket_elem_t start_elem, bracket_elem_t end_elem)
				2627	# else /* not RE_ENABLE_I18N */
				2628	build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
				2629	bracket_elem_t *end_elem)
				2630	# endif /* not RE_ENABLE_I18N */
				2631	{
				2632	unsigned int start_ch, end_ch;
				2633	/* Equivalence Classes and Character Classes can't be a range start/end. */
				2634	if (BE (start_elem->type == EQUIV_CLASS \|\| start_elem->type == CHAR_CLASS
				2635	\|\| end_elem->type == EQUIV_CLASS \|\| end_elem->type == CHAR_CLASS,
				2636	0))
				2637	return REG_ERANGE;
				2638
				2639	/* We can handle no multi character collating elements without libc
				2640	support. */
				2641	if (BE ((start_elem->type == COLL_SYM
				2642	&& strlen ((char *) start_elem->opr.name) > 1)
				2643	\|\| (end_elem->type == COLL_SYM
				2644	&& strlen ((char *) end_elem->opr.name) > 1), 0))
				2645	return REG_ECOLLATE;
				2646
				2647	# ifdef RE_ENABLE_I18N
				2648	{
				2649	wchar_t wc;
				2650	wint_t start_wc;
				2651	wint_t end_wc;
				2652	wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
				2653
				2654	start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
				2655	: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
				2656	: 0));
				2657	end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
				2658	: ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
				2659	: 0));
				2660	#ifdef GAWK
				2661	/*
				2662	* Fedora Core 2, maybe others, have broken `btowc' that returns -1
				2663	* for any value > 127. Sigh. Note that `start_ch' and `end_ch' are
				2664	* unsigned, so we don't have sign extension problems.
				2665	*/
				2666	start_wc = ((start_elem->type == SB_CHAR \|\| start_elem->type == COLL_SYM)
				2667	? start_ch : start_elem->opr.wch);
				2668	end_wc = ((end_elem->type == SB_CHAR \|\| end_elem->type == COLL_SYM)
				2669	? end_ch : end_elem->opr.wch);
				2670	#else
				2671	start_wc = ((start_elem->type == SB_CHAR \|\| start_elem->type == COLL_SYM)
				2672	? __btowc (start_ch) : start_elem->opr.wch);
				2673	end_wc = ((end_elem->type == SB_CHAR \|\| end_elem->type == COLL_SYM)
				2674	? __btowc (end_ch) : end_elem->opr.wch);
				2675	#endif
				2676	if (start_wc == WEOF \|\| end_wc == WEOF)
				2677	return REG_ECOLLATE;
				2678	cmp_buf[0] = start_wc;
				2679	cmp_buf[4] = end_wc;
				2680	if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
				2681	return REG_ERANGE;
				2682
				2683	/* Got valid collation sequence values, add them as a new entry.
				2684	However, for !_LIBC we have no collation elements: if the
				2685	character set is single byte, the single byte character set
				2686	that we build below suffices. parse_bracket_exp passes
				2687	no MBCSET if dfa->mb_cur_max == 1. */
				2688	if (mbcset)
				2689	{
				2690	/* Check the space of the arrays. */
				2691	if (BE (*range_alloc == mbcset->nranges, 0))
				2692	{
				2693	/* There is not enough space, need realloc. */
				2694	wchar_t new_array_start, new_array_end;
				2695	int new_nranges;
				2696
				2697	/* +1 in case of mbcset->nranges is 0. */
				2698	new_nranges = 2 * mbcset->nranges + 1;
				2699	/* Use realloc since mbcset->range_starts and mbcset->range_ends
				2700	are NULL if range_alloc == 0. /
				2701	new_array_start = re_realloc (mbcset->range_starts, wchar_t,
				2702	new_nranges);
				2703	new_array_end = re_realloc (mbcset->range_ends, wchar_t,
				2704	new_nranges);
				2705
				2706	if (BE (new_array_start == NULL \|\| new_array_end == NULL, 0))
				2707	return REG_ESPACE;
				2708
				2709	mbcset->range_starts = new_array_start;
				2710	mbcset->range_ends = new_array_end;
				2711	*range_alloc = new_nranges;
				2712	}
				2713
				2714	mbcset->range_starts[mbcset->nranges] = start_wc;
				2715	mbcset->range_ends[mbcset->nranges++] = end_wc;
				2716	}
				2717
				2718	/* Build the table for single byte characters. */
				2719	for (wc = 0; wc < SBC_MAX; ++wc)
				2720	{
				2721	cmp_buf[2] = wc;
				2722	if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
				2723	&& wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
				2724	bitset_set (sbcset, wc);
				2725	}
				2726	}
				2727	# else /* not RE_ENABLE_I18N */
				2728	{
				2729	unsigned int ch;
				2730	start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
				2731	: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
				2732	: 0));
				2733	end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
				2734	: ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
				2735	: 0));
				2736	if (start_ch > end_ch)
				2737	return REG_ERANGE;
				2738	/* Build the table for single byte characters. */
				2739	for (ch = 0; ch < SBC_MAX; ++ch)
				2740	if (start_ch <= ch && ch <= end_ch)
				2741	bitset_set (sbcset, ch);
				2742	}
				2743	# endif /* not RE_ENABLE_I18N */
				2744	return REG_NOERROR;
				2745	}
				2746	#endif /* not _LIBC */
				2747
				2748	#ifndef _LIBC
				2749	/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
				2750	Build the collating element which is represented by NAME.
				2751	The result are written to MBCSET and SBCSET.
				2752	COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
				2753	pointer argument since we may update it. */
				2754
				2755	static reg_errcode_t
				2756	internal_function
				2757	# ifdef RE_ENABLE_I18N
				2758	build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
				2759	int coll_sym_alloc, const unsigned char name)
				2760	# else /* not RE_ENABLE_I18N */
				2761	build_collating_symbol (bitset_t sbcset, const unsigned char *name)
				2762	# endif /* not RE_ENABLE_I18N */
				2763	{
				2764	size_t name_len = strlen ((const char *) name);
				2765	if (BE (name_len != 1, 0))
				2766	return REG_ECOLLATE;
				2767	else
				2768	{
				2769	bitset_set (sbcset, name[0]);
				2770	return REG_NOERROR;
				2771	}
				2772	}
				2773	#endif /* not _LIBC */
				2774
				2775	/* This function parse bracket expression like "[abc]", "[a-c]",
				2776	"[[.a-a.]]" etc. */
				2777
				2778	static bin_tree_t *
				2779	parse_bracket_exp (re_string_t regexp, re_dfa_t dfa, re_token_t *token,
				2780	reg_syntax_t syntax, reg_errcode_t *err)
				2781	{
				2782	#ifdef _LIBC
				2783	const unsigned char *collseqmb;
				2784	const char *collseqwc;
				2785	uint32_t nrules;
				2786	int32_t table_size;
				2787	const int32_t *symb_table;
				2788	const unsigned char *extra;
				2789
Ondřej Bílka	749f763	2013-07-22 23:02:23 +0200	[diff] [blame]	2790	/* Local function for parse_bracket_exp used in _LIBC environment.
Ævar Arnfjörð Bjarmason	d18f76d	2010-08-17 09:24:38 +0000	[diff] [blame]	2791	Seek the collating symbol entry correspondings to NAME.
				2792	Return the index of the symbol in the SYMB_TABLE. */
				2793
				2794	auto inline int32_t
				2795	__attribute ((always_inline))
				2796	seek_collating_symbol_entry (name, name_len)
				2797	const unsigned char *name;
				2798	size_t name_len;
				2799	{
				2800	int32_t hash = elem_hash ((const char *) name, name_len);
				2801	int32_t elem = hash % table_size;
				2802	if (symb_table[2 * elem] != 0)
				2803	{
				2804	int32_t second = hash % (table_size - 2) + 1;
				2805
				2806	do
				2807	{
				2808	/* First compare the hashing value. */
				2809	if (symb_table[2 * elem] == hash
				2810	/* Compare the length of the name. */
				2811	&& name_len == extra[symb_table[2 * elem + 1]]
				2812	/* Compare the name. */
				2813	&& memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
				2814	name_len) == 0)
				2815	{
				2816	/* Yep, this is the entry. */
				2817	break;
				2818	}
				2819
				2820	/* Next entry. */
				2821	elem += second;
				2822	}
				2823	while (symb_table[2 * elem] != 0);
				2824	}
				2825	return elem;
				2826	}
				2827
				2828	/* Local function for parse_bracket_exp used in _LIBC environment.
				2829	Look up the collation sequence value of BR_ELEM.
				2830	Return the value if succeeded, UINT_MAX otherwise. */
				2831
				2832	auto inline unsigned int
				2833	__attribute ((always_inline))
				2834	lookup_collation_sequence_value (br_elem)
				2835	bracket_elem_t *br_elem;
				2836	{
				2837	if (br_elem->type == SB_CHAR)
				2838	{
				2839	/*
				2840	if (MB_CUR_MAX == 1)
				2841	*/
				2842	if (nrules == 0)
				2843	return collseqmb[br_elem->opr.ch];
				2844	else
				2845	{
				2846	wint_t wc = __btowc (br_elem->opr.ch);
				2847	return __collseq_table_lookup (collseqwc, wc);
				2848	}
				2849	}
				2850	else if (br_elem->type == MB_CHAR)
				2851	{
				2852	if (nrules != 0)
				2853	return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
				2854	}
				2855	else if (br_elem->type == COLL_SYM)
				2856	{
				2857	size_t sym_name_len = strlen ((char *) br_elem->opr.name);
				2858	if (nrules != 0)
				2859	{
				2860	int32_t elem, idx;
				2861	elem = seek_collating_symbol_entry (br_elem->opr.name,
				2862	sym_name_len);
				2863	if (symb_table[2 * elem] != 0)
				2864	{
				2865	/* We found the entry. */
				2866	idx = symb_table[2 * elem + 1];
				2867	/* Skip the name of collating element name. */
				2868	idx += 1 + extra[idx];
				2869	/* Skip the byte sequence of the collating element. */
				2870	idx += 1 + extra[idx];
				2871	/* Adjust for the alignment. */
				2872	idx = (idx + 3) & ~3;
				2873	/* Skip the multibyte collation sequence value. */
				2874	idx += sizeof (unsigned int);
				2875	/* Skip the wide char sequence of the collating element. */
				2876	idx += sizeof (unsigned int) *
				2877	(1 + (unsigned int ) (extra + idx));
				2878	/* Return the collation sequence value. */
				2879	return (unsigned int ) (extra + idx);
				2880	}
				2881	else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
				2882	{
				2883	/* No valid character. Match it as a single byte
				2884	character. */
				2885	return collseqmb[br_elem->opr.name[0]];
				2886	}
				2887	}
				2888	else if (sym_name_len == 1)
				2889	return collseqmb[br_elem->opr.name[0]];
				2890	}
				2891	return UINT_MAX;
				2892	}
				2893
Ondřej Bílka	749f763	2013-07-22 23:02:23 +0200	[diff] [blame]	2894	/* Local function for parse_bracket_exp used in _LIBC environment.
Ævar Arnfjörð Bjarmason	d18f76d	2010-08-17 09:24:38 +0000	[diff] [blame]	2895	Build the range expression which starts from START_ELEM, and ends
				2896	at END_ELEM. The result are written to MBCSET and SBCSET.
				2897	RANGE_ALLOC is the allocated size of mbcset->range_starts, and
Ondřej Bílka	749f763	2013-07-22 23:02:23 +0200	[diff] [blame]	2898	mbcset->range_ends, is a pointer argument since we may
Ævar Arnfjörð Bjarmason	d18f76d	2010-08-17 09:24:38 +0000	[diff] [blame]	2899	update it. */
				2900
				2901	auto inline reg_errcode_t
				2902	__attribute ((always_inline))
				2903	build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
				2904	re_charset_t *mbcset;
				2905	int *range_alloc;
				2906	bitset_t sbcset;
				2907	bracket_elem_t start_elem, end_elem;
				2908	{
				2909	unsigned int ch;
				2910	uint32_t start_collseq;
				2911	uint32_t end_collseq;
				2912
				2913	/* Equivalence Classes and Character Classes can't be a range
				2914	start/end. */
				2915	if (BE (start_elem->type == EQUIV_CLASS \|\| start_elem->type == CHAR_CLASS
				2916	\|\| end_elem->type == EQUIV_CLASS \|\| end_elem->type == CHAR_CLASS,
				2917	0))
				2918	return REG_ERANGE;
				2919
				2920	start_collseq = lookup_collation_sequence_value (start_elem);
				2921	end_collseq = lookup_collation_sequence_value (end_elem);
				2922	/* Check start/end collation sequence values. */
				2923	if (BE (start_collseq == UINT_MAX \|\| end_collseq == UINT_MAX, 0))
				2924	return REG_ECOLLATE;
				2925	if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
				2926	return REG_ERANGE;
				2927
				2928	/* Got valid collation sequence values, add them as a new entry.
				2929	However, if we have no collation elements, and the character set
				2930	is single byte, the single byte character set that we
				2931	build below suffices. */
				2932	if (nrules > 0 \|\| dfa->mb_cur_max > 1)
				2933	{
				2934	/* Check the space of the arrays. */
				2935	if (BE (*range_alloc == mbcset->nranges, 0))
				2936	{
				2937	/* There is not enough space, need realloc. */
				2938	uint32_t *new_array_start;
				2939	uint32_t *new_array_end;
				2940	int new_nranges;
				2941
				2942	/* +1 in case of mbcset->nranges is 0. */
				2943	new_nranges = 2 * mbcset->nranges + 1;
				2944	new_array_start = re_realloc (mbcset->range_starts, uint32_t,
				2945	new_nranges);
				2946	new_array_end = re_realloc (mbcset->range_ends, uint32_t,
				2947	new_nranges);
				2948
				2949	if (BE (new_array_start == NULL \|\| new_array_end == NULL, 0))
				2950	return REG_ESPACE;
				2951
				2952	mbcset->range_starts = new_array_start;
				2953	mbcset->range_ends = new_array_end;
				2954	*range_alloc = new_nranges;
				2955	}
				2956
				2957	mbcset->range_starts[mbcset->nranges] = start_collseq;
				2958	mbcset->range_ends[mbcset->nranges++] = end_collseq;
				2959	}
				2960
				2961	/* Build the table for single byte characters. */
				2962	for (ch = 0; ch < SBC_MAX; ch++)
				2963	{
				2964	uint32_t ch_collseq;
				2965	/*
				2966	if (MB_CUR_MAX == 1)
				2967	*/
				2968	if (nrules == 0)
				2969	ch_collseq = collseqmb[ch];
				2970	else
				2971	ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
				2972	if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
				2973	bitset_set (sbcset, ch);
				2974	}
				2975	return REG_NOERROR;
				2976	}
				2977
Ondřej Bílka	749f763	2013-07-22 23:02:23 +0200	[diff] [blame]	2978	/* Local function for parse_bracket_exp used in _LIBC environment.
Ævar Arnfjörð Bjarmason	d18f76d	2010-08-17 09:24:38 +0000	[diff] [blame]	2979	Build the collating element which is represented by NAME.
				2980	The result are written to MBCSET and SBCSET.
				2981	COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
Ondřej Bílka	749f763	2013-07-22 23:02:23 +0200	[diff] [blame]	2982	pointer argument since we may update it. */
Ævar Arnfjörð Bjarmason	d18f76d	2010-08-17 09:24:38 +0000	[diff] [blame]	2983
				2984	auto inline reg_errcode_t
				2985	__attribute ((always_inline))
				2986	build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
				2987	re_charset_t *mbcset;
				2988	int *coll_sym_alloc;
				2989	bitset_t sbcset;
				2990	const unsigned char *name;
				2991	{
				2992	int32_t elem, idx;
				2993	size_t name_len = strlen ((const char *) name);
				2994	if (nrules != 0)
				2995	{
				2996	elem = seek_collating_symbol_entry (name, name_len);
				2997	if (symb_table[2 * elem] != 0)
				2998	{
				2999	/* We found the entry. */
				3000	idx = symb_table[2 * elem + 1];
				3001	/* Skip the name of collating element name. */
				3002	idx += 1 + extra[idx];
				3003	}
				3004	else if (symb_table[2 * elem] == 0 && name_len == 1)
				3005	{
				3006	/* No valid character, treat it as a normal
				3007	character. */
				3008	bitset_set (sbcset, name[0]);
				3009	return REG_NOERROR;
				3010	}
				3011	else
				3012	return REG_ECOLLATE;
				3013
				3014	/* Got valid collation sequence, add it as a new entry. */
				3015	/* Check the space of the arrays. */
				3016	if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
				3017	{
				3018	/* Not enough, realloc it. */
				3019	/* +1 in case of mbcset->ncoll_syms is 0. */
				3020	int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
				3021	/* Use realloc since mbcset->coll_syms is NULL
				3022	if alloc == 0. /
				3023	int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
				3024	new_coll_sym_alloc);
				3025	if (BE (new_coll_syms == NULL, 0))
				3026	return REG_ESPACE;
				3027	mbcset->coll_syms = new_coll_syms;
				3028	*coll_sym_alloc = new_coll_sym_alloc;
				3029	}
				3030	mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
				3031	return REG_NOERROR;
				3032	}
				3033	else
				3034	{
				3035	if (BE (name_len != 1, 0))
				3036	return REG_ECOLLATE;
				3037	else
				3038	{
				3039	bitset_set (sbcset, name[0]);
				3040	return REG_NOERROR;
				3041	}
				3042	}
				3043	}
				3044	#endif
				3045
				3046	re_token_t br_token;
				3047	re_bitset_ptr_t sbcset;
				3048	#ifdef RE_ENABLE_I18N
				3049	re_charset_t *mbcset;
				3050	int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
				3051	int equiv_class_alloc = 0, char_class_alloc = 0;
				3052	#endif /* not RE_ENABLE_I18N */
				3053	int non_match = 0;
				3054	bin_tree_t *work_tree;
				3055	int token_len;
				3056	int first_round = 1;
				3057	#ifdef _LIBC
				3058	collseqmb = (const unsigned char *)
				3059	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
				3060	nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
				3061	if (nrules)
				3062	{
				3063	/*
				3064	if (MB_CUR_MAX > 1)
				3065	*/
				3066	collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
				3067	table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
				3068	symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
				3069	_NL_COLLATE_SYMB_TABLEMB);
				3070	extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
				3071	_NL_COLLATE_SYMB_EXTRAMB);
				3072	}
				3073	#endif
				3074	sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
				3075	#ifdef RE_ENABLE_I18N
				3076	mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
				3077	#endif /* RE_ENABLE_I18N */
				3078	#ifdef RE_ENABLE_I18N
				3079	if (BE (sbcset == NULL \|\| mbcset == NULL, 0))
				3080	#else
				3081	if (BE (sbcset == NULL, 0))
				3082	#endif /* RE_ENABLE_I18N */
				3083	{
				3084	*err = REG_ESPACE;
				3085	return NULL;
				3086	}
				3087
				3088	token_len = peek_token_bracket (token, regexp, syntax);
				3089	if (BE (token->type == END_OF_RE, 0))
				3090	{
				3091	*err = REG_BADPAT;
				3092	goto parse_bracket_exp_free_return;
				3093	}
				3094	if (token->type == OP_NON_MATCH_LIST)
				3095	{
				3096	#ifdef RE_ENABLE_I18N
				3097	mbcset->non_match = 1;
				3098	#endif /* not RE_ENABLE_I18N */
				3099	non_match = 1;
				3100	if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
				3101	bitset_set (sbcset, '\n');
				3102	re_string_skip_bytes (regexp, token_len); /* Skip a token. */
				3103	token_len = peek_token_bracket (token, regexp, syntax);
				3104	if (BE (token->type == END_OF_RE, 0))
				3105	{
				3106	*err = REG_BADPAT;
				3107	goto parse_bracket_exp_free_return;
				3108	}
				3109	}
				3110
				3111	/* We treat the first ']' as a normal character. */
				3112	if (token->type == OP_CLOSE_BRACKET)
				3113	token->type = CHARACTER;
				3114
				3115	while (1)
				3116	{
				3117	bracket_elem_t start_elem, end_elem;
				3118	unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
				3119	unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
				3120	reg_errcode_t ret;
				3121	int token_len2 = 0, is_range_exp = 0;
				3122	re_token_t token2;
				3123
				3124	start_elem.opr.name = start_name_buf;
				3125	ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
				3126	syntax, first_round);
				3127	if (BE (ret != REG_NOERROR, 0))
				3128	{
				3129	*err = ret;
				3130	goto parse_bracket_exp_free_return;
				3131	}
				3132	first_round = 0;
				3133
				3134	/* Get information about the next token. We need it in any case. */
				3135	token_len = peek_token_bracket (token, regexp, syntax);
				3136
				3137	/* Do not check for ranges if we know they are not allowed. */
				3138	if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
				3139	{
				3140	if (BE (token->type == END_OF_RE, 0))
				3141	{
				3142	*err = REG_EBRACK;
				3143	goto parse_bracket_exp_free_return;
				3144	}
				3145	if (token->type == OP_CHARSET_RANGE)
				3146	{
				3147	re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
				3148	token_len2 = peek_token_bracket (&token2, regexp, syntax);
				3149	if (BE (token2.type == END_OF_RE, 0))
				3150	{
				3151	*err = REG_EBRACK;
				3152	goto parse_bracket_exp_free_return;
				3153	}
				3154	if (token2.type == OP_CLOSE_BRACKET)
				3155	{
				3156	/* We treat the last '-' as a normal character. */
				3157	re_string_skip_bytes (regexp, -token_len);
				3158	token->type = CHARACTER;
				3159	}
				3160	else
				3161	is_range_exp = 1;
				3162	}
				3163	}
				3164
				3165	if (is_range_exp == 1)
				3166	{
				3167	end_elem.opr.name = end_name_buf;
				3168	ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
				3169	dfa, syntax, 1);
				3170	if (BE (ret != REG_NOERROR, 0))
				3171	{
				3172	*err = ret;
				3173	goto parse_bracket_exp_free_return;
				3174	}
				3175
				3176	token_len = peek_token_bracket (token, regexp, syntax);
				3177
				3178	#ifdef _LIBC
				3179	*err = build_range_exp (sbcset, mbcset, &range_alloc,
				3180	&start_elem, &end_elem);
				3181	#else
				3182	# ifdef RE_ENABLE_I18N
				3183	*err = build_range_exp (sbcset,
				3184	dfa->mb_cur_max > 1 ? mbcset : NULL,
				3185	&range_alloc, &start_elem, &end_elem);
				3186	# else
				3187	*err = build_range_exp (sbcset, &start_elem, &end_elem);
				3188	# endif
				3189	#endif /* RE_ENABLE_I18N */
				3190	if (BE (*err != REG_NOERROR, 0))
				3191	goto parse_bracket_exp_free_return;
				3192	}
				3193	else
				3194	{
				3195	switch (start_elem.type)
				3196	{
				3197	case SB_CHAR:
				3198	bitset_set (sbcset, start_elem.opr.ch);
				3199	break;
				3200	#ifdef RE_ENABLE_I18N
				3201	case MB_CHAR:
				3202	/* Check whether the array has enough space. */
				3203	if (BE (mbchar_alloc == mbcset->nmbchars, 0))
				3204	{
				3205	wchar_t *new_mbchars;
				3206	/* Not enough, realloc it. */
				3207	/* +1 in case of mbcset->nmbchars is 0. */
				3208	mbchar_alloc = 2 * mbcset->nmbchars + 1;
				3209	/* Use realloc since array is NULL if alloc == 0. /
				3210	new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
				3211	mbchar_alloc);
				3212	if (BE (new_mbchars == NULL, 0))
				3213	goto parse_bracket_exp_espace;
				3214	mbcset->mbchars = new_mbchars;
				3215	}
				3216	mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
				3217	break;
				3218	#endif /* RE_ENABLE_I18N */
				3219	case EQUIV_CLASS:
				3220	*err = build_equiv_class (sbcset,
				3221	#ifdef RE_ENABLE_I18N
				3222	mbcset, &equiv_class_alloc,
				3223	#endif /* RE_ENABLE_I18N */
				3224	start_elem.opr.name);
				3225	if (BE (*err != REG_NOERROR, 0))
				3226	goto parse_bracket_exp_free_return;
				3227	break;
				3228	case COLL_SYM:
				3229	*err = build_collating_symbol (sbcset,
				3230	#ifdef RE_ENABLE_I18N
				3231	mbcset, &coll_sym_alloc,
				3232	#endif /* RE_ENABLE_I18N */
				3233	start_elem.opr.name);
				3234	if (BE (*err != REG_NOERROR, 0))
				3235	goto parse_bracket_exp_free_return;
				3236	break;
				3237	case CHAR_CLASS:
				3238	*err = build_charclass (regexp->trans, sbcset,
				3239	#ifdef RE_ENABLE_I18N
				3240	mbcset, &char_class_alloc,
				3241	#endif /* RE_ENABLE_I18N */
				3242	(const char *) start_elem.opr.name, syntax);
				3243	if (BE (*err != REG_NOERROR, 0))
				3244	goto parse_bracket_exp_free_return;
				3245	break;
				3246	default:
				3247	assert (0);
				3248	break;
				3249	}
				3250	}
				3251	if (BE (token->type == END_OF_RE, 0))
				3252	{
				3253	*err = REG_EBRACK;
				3254	goto parse_bracket_exp_free_return;
				3255	}
				3256	if (token->type == OP_CLOSE_BRACKET)
				3257	break;
				3258	}
				3259
				3260	re_string_skip_bytes (regexp, token_len); /* Skip a token. */
				3261
				3262	/* If it is non-matching list. */
				3263	if (non_match)
				3264	bitset_not (sbcset);
				3265
				3266	#ifdef RE_ENABLE_I18N
				3267	/* Ensure only single byte characters are set. */
				3268	if (dfa->mb_cur_max > 1)
				3269	bitset_mask (sbcset, dfa->sb_char);
				3270
				3271	if (mbcset->nmbchars \|\| mbcset->ncoll_syms \|\| mbcset->nequiv_classes
				3272	\|\| mbcset->nranges \|\| (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
				3273	\|\| mbcset->non_match)))
				3274	{
				3275	bin_tree_t *mbc_tree;
				3276	int sbc_idx;
				3277	/* Build a tree for complex bracket. */
				3278	dfa->has_mb_node = 1;
				3279	br_token.type = COMPLEX_BRACKET;
				3280	br_token.opr.mbcset = mbcset;
				3281	mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
				3282	if (BE (mbc_tree == NULL, 0))
				3283	goto parse_bracket_exp_espace;
				3284	for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
				3285	if (sbcset[sbc_idx])
				3286	break;
				3287	/* If there are no bits set in sbcset, there is no point
				3288	of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
				3289	if (sbc_idx < BITSET_WORDS)
				3290	{
				3291	/* Build a tree for simple bracket. */
				3292	br_token.type = SIMPLE_BRACKET;
				3293	br_token.opr.sbcset = sbcset;
				3294	work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
				3295	if (BE (work_tree == NULL, 0))
				3296	goto parse_bracket_exp_espace;
				3297
				3298	/* Then join them by ALT node. */
				3299	work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
				3300	if (BE (work_tree == NULL, 0))
				3301	goto parse_bracket_exp_espace;
				3302	}
				3303	else
				3304	{
				3305	re_free (sbcset);
				3306	work_tree = mbc_tree;
				3307	}
				3308	}
				3309	else
				3310	#endif /* not RE_ENABLE_I18N */
				3311	{
				3312	#ifdef RE_ENABLE_I18N
				3313	free_charset (mbcset);
				3314	#endif
				3315	/* Build a tree for simple bracket. */
				3316	br_token.type = SIMPLE_BRACKET;
				3317	br_token.opr.sbcset = sbcset;
				3318	work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
				3319	if (BE (work_tree == NULL, 0))
				3320	goto parse_bracket_exp_espace;
				3321	}
				3322	return work_tree;
				3323
				3324	parse_bracket_exp_espace:
				3325	*err = REG_ESPACE;
				3326	parse_bracket_exp_free_return:
				3327	re_free (sbcset);
				3328	#ifdef RE_ENABLE_I18N
				3329	free_charset (mbcset);
				3330	#endif /* RE_ENABLE_I18N */
				3331	return NULL;
				3332	}
				3333
				3334	/* Parse an element in the bracket expression. */
				3335
				3336	static reg_errcode_t
				3337	parse_bracket_element (bracket_elem_t elem, re_string_t regexp,
				3338	re_token_t token, int token_len, re_dfa_t dfa,
				3339	reg_syntax_t syntax, int accept_hyphen)
				3340	{
				3341	#ifdef RE_ENABLE_I18N
				3342	int cur_char_size;
				3343	cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
				3344	if (cur_char_size > 1)
				3345	{
				3346	elem->type = MB_CHAR;
				3347	elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
				3348	re_string_skip_bytes (regexp, cur_char_size);
				3349	return REG_NOERROR;
				3350	}
				3351	#endif /* RE_ENABLE_I18N */
				3352	re_string_skip_bytes (regexp, token_len); /* Skip a token. */
				3353	if (token->type == OP_OPEN_COLL_ELEM \|\| token->type == OP_OPEN_CHAR_CLASS
				3354	\|\| token->type == OP_OPEN_EQUIV_CLASS)
				3355	return parse_bracket_symbol (elem, regexp, token);
				3356	if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
				3357	{
				3358	/* A '-' must only appear as anything but a range indicator before
				3359	the closing bracket. Everything else is an error. */
				3360	re_token_t token2;
				3361	(void) peek_token_bracket (&token2, regexp, syntax);
				3362	if (token2.type != OP_CLOSE_BRACKET)
				3363	/* The actual error value is not standardized since this whole
				3364	case is undefined. But ERANGE makes good sense. */
				3365	return REG_ERANGE;
				3366	}
				3367	elem->type = SB_CHAR;
				3368	elem->opr.ch = token->opr.c;
				3369	return REG_NOERROR;
				3370	}
				3371
				3372	/* Parse a bracket symbol in the bracket expression. Bracket symbols are
				3373	such as [:<character_class>:], [.<collating_element>.], and
				3374	[=<equivalent_class>=]. */
				3375
				3376	static reg_errcode_t
				3377	parse_bracket_symbol (bracket_elem_t elem, re_string_t regexp,
				3378	re_token_t *token)
				3379	{
				3380	unsigned char ch, delim = token->opr.c;
				3381	int i = 0;
				3382	if (re_string_eoi(regexp))
				3383	return REG_EBRACK;
				3384	for (;; ++i)
				3385	{
				3386	if (i >= BRACKET_NAME_BUF_SIZE)
				3387	return REG_EBRACK;
				3388	if (token->type == OP_OPEN_CHAR_CLASS)
				3389	ch = re_string_fetch_byte_case (regexp);
				3390	else
				3391	ch = re_string_fetch_byte (regexp);
				3392	if (re_string_eoi(regexp))
				3393	return REG_EBRACK;
				3394	if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
				3395	break;
				3396	elem->opr.name[i] = ch;
				3397	}
				3398	re_string_skip_bytes (regexp, 1);
				3399	elem->opr.name[i] = '\0';
				3400	switch (token->type)
				3401	{
				3402	case OP_OPEN_COLL_ELEM:
				3403	elem->type = COLL_SYM;
				3404	break;
				3405	case OP_OPEN_EQUIV_CLASS:
				3406	elem->type = EQUIV_CLASS;
				3407	break;
				3408	case OP_OPEN_CHAR_CLASS:
				3409	elem->type = CHAR_CLASS;
				3410	break;
				3411	default:
				3412	break;
				3413	}
				3414	return REG_NOERROR;
				3415	}
				3416
				3417	/* Helper function for parse_bracket_exp.
				3418	Build the equivalence class which is represented by NAME.
				3419	The result are written to MBCSET and SBCSET.
				3420	EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
Ondřej Bílka	749f763	2013-07-22 23:02:23 +0200	[diff] [blame]	3421	is a pointer argument since we may update it. */
Ævar Arnfjörð Bjarmason	d18f76d	2010-08-17 09:24:38 +0000	[diff] [blame]	3422
				3423	static reg_errcode_t
				3424	#ifdef RE_ENABLE_I18N
				3425	build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
				3426	int equiv_class_alloc, const unsigned char name)
				3427	#else /* not RE_ENABLE_I18N */
				3428	build_equiv_class (bitset_t sbcset, const unsigned char *name)
				3429	#endif /* not RE_ENABLE_I18N */
				3430	{
				3431	#ifdef _LIBC
				3432	uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
				3433	if (nrules != 0)
				3434	{
				3435	const int32_t table, indirect;
				3436	const unsigned char weights, extra, *cp;
				3437	unsigned char char_buf[2];
				3438	int32_t idx1, idx2;
				3439	unsigned int ch;
				3440	size_t len;
				3441	/* This #include defines a local function! */
				3442	# include <locale/weight.h>
				3443	/* Calculate the index for equivalence class. */
				3444	cp = name;
				3445	table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
				3446	weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
				3447	_NL_COLLATE_WEIGHTMB);
				3448	extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
				3449	_NL_COLLATE_EXTRAMB);
				3450	indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
				3451	_NL_COLLATE_INDIRECTMB);
				3452	idx1 = findidx (&cp);
				3453	if (BE (idx1 == 0 \|\| cp < name + strlen ((const char *) name), 0))
				3454	/* This isn't a valid character. */
				3455	return REG_ECOLLATE;
				3456
				3457	/* Build single byte matcing table for this equivalence class. */
				3458	char_buf[1] = (unsigned char) '\0';
				3459	len = weights[idx1 & 0xffffff];
				3460	for (ch = 0; ch < SBC_MAX; ++ch)
				3461	{
				3462	char_buf[0] = ch;
				3463	cp = char_buf;
				3464	idx2 = findidx (&cp);
				3465	/*
				3466	idx2 = table[ch];
				3467	*/
				3468	if (idx2 == 0)
				3469	/* This isn't a valid character. */
				3470	continue;
				3471	/* Compare only if the length matches and the collation rule
				3472	index is the same. */
				3473	if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
				3474	{
				3475	int cnt = 0;
				3476
				3477	while (cnt <= len &&
				3478	weights[(idx1 & 0xffffff) + 1 + cnt]
				3479	== weights[(idx2 & 0xffffff) + 1 + cnt])
				3480	++cnt;
				3481
				3482	if (cnt > len)
				3483	bitset_set (sbcset, ch);
				3484	}
				3485	}
				3486	/* Check whether the array has enough space. */
				3487	if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
				3488	{
				3489	/* Not enough, realloc it. */
				3490	/* +1 in case of mbcset->nequiv_classes is 0. */
				3491	int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
				3492	/* Use realloc since the array is NULL if alloc == 0. /
				3493	int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
				3494	int32_t,
				3495	new_equiv_class_alloc);
				3496	if (BE (new_equiv_classes == NULL, 0))
				3497	return REG_ESPACE;
				3498	mbcset->equiv_classes = new_equiv_classes;
				3499	*equiv_class_alloc = new_equiv_class_alloc;
				3500	}
				3501	mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
				3502	}
				3503	else
				3504	#endif /* _LIBC */
				3505	{
				3506	if (BE (strlen ((const char *) name) != 1, 0))
				3507	return REG_ECOLLATE;
				3508	bitset_set (sbcset, *name);
				3509	}
				3510	return REG_NOERROR;
				3511	}
				3512
				3513	/* Helper function for parse_bracket_exp.
				3514	Build the character class which is represented by NAME.
				3515	The result are written to MBCSET and SBCSET.
				3516	CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
Ondřej Bílka	749f763	2013-07-22 23:02:23 +0200	[diff] [blame]	3517	is a pointer argument since we may update it. */
Ævar Arnfjörð Bjarmason	d18f76d	2010-08-17 09:24:38 +0000	[diff] [blame]	3518
				3519	static reg_errcode_t
				3520	#ifdef RE_ENABLE_I18N
				3521	build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
				3522	re_charset_t mbcset, int char_class_alloc,
				3523	const char *class_name, reg_syntax_t syntax)
				3524	#else /* not RE_ENABLE_I18N */
				3525	build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
				3526	const char *class_name, reg_syntax_t syntax)
				3527	#endif /* not RE_ENABLE_I18N */
				3528	{
				3529	int i;
				3530
				3531	/* In case of REG_ICASE "upper" and "lower" match the both of
				3532	upper and lower cases. */
				3533	if ((syntax & RE_ICASE)
				3534	&& (strcmp (class_name, "upper") == 0 \|\| strcmp (class_name, "lower") == 0))
				3535	class_name = "alpha";
				3536
				3537	#ifdef RE_ENABLE_I18N
				3538	/* Check the space of the arrays. */
				3539	if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
				3540	{
				3541	/* Not enough, realloc it. */
				3542	/* +1 in case of mbcset->nchar_classes is 0. */
				3543	int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
				3544	/* Use realloc since array is NULL if alloc == 0. /
				3545	wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
				3546	new_char_class_alloc);
				3547	if (BE (new_char_classes == NULL, 0))
				3548	return REG_ESPACE;
				3549	mbcset->char_classes = new_char_classes;
				3550	*char_class_alloc = new_char_class_alloc;
				3551	}
				3552	mbcset->char_classes[mbcset->nchar_classes++] = __wctype (class_name);
				3553	#endif /* RE_ENABLE_I18N */
				3554
				3555	#define BUILD_CHARCLASS_LOOP(ctype_func) \
				3556	do { \
				3557	if (BE (trans != NULL, 0)) \
				3558	{ \
				3559	for (i = 0; i < SBC_MAX; ++i) \
				3560	if (ctype_func (i)) \
				3561	bitset_set (sbcset, trans[i]); \
				3562	} \
				3563	else \
				3564	{ \
				3565	for (i = 0; i < SBC_MAX; ++i) \
				3566	if (ctype_func (i)) \
				3567	bitset_set (sbcset, i); \
				3568	} \
				3569	} while (0)
				3570
				3571	if (strcmp (class_name, "alnum") == 0)
				3572	BUILD_CHARCLASS_LOOP (isalnum);
				3573	else if (strcmp (class_name, "cntrl") == 0)
				3574	BUILD_CHARCLASS_LOOP (iscntrl);
				3575	else if (strcmp (class_name, "lower") == 0)
				3576	BUILD_CHARCLASS_LOOP (islower);
				3577	else if (strcmp (class_name, "space") == 0)
				3578	BUILD_CHARCLASS_LOOP (isspace);
				3579	else if (strcmp (class_name, "alpha") == 0)
				3580	BUILD_CHARCLASS_LOOP (isalpha);
				3581	else if (strcmp (class_name, "digit") == 0)
				3582	BUILD_CHARCLASS_LOOP (isdigit);
				3583	else if (strcmp (class_name, "print") == 0)
				3584	BUILD_CHARCLASS_LOOP (isprint);
				3585	else if (strcmp (class_name, "upper") == 0)
				3586	BUILD_CHARCLASS_LOOP (isupper);
				3587	else if (strcmp (class_name, "blank") == 0)
				3588	#ifndef GAWK
				3589	BUILD_CHARCLASS_LOOP (isblank);
				3590	#else
				3591	/* see comments above */
				3592	BUILD_CHARCLASS_LOOP (is_blank);
				3593	#endif
				3594	else if (strcmp (class_name, "graph") == 0)
				3595	BUILD_CHARCLASS_LOOP (isgraph);
				3596	else if (strcmp (class_name, "punct") == 0)
				3597	BUILD_CHARCLASS_LOOP (ispunct);
				3598	else if (strcmp (class_name, "xdigit") == 0)
				3599	BUILD_CHARCLASS_LOOP (isxdigit);
				3600	else
				3601	return REG_ECTYPE;
				3602
				3603	return REG_NOERROR;
				3604	}
				3605
				3606	static bin_tree_t *
				3607	build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
				3608	const char *class_name,
				3609	const char *extra, int non_match,
				3610	reg_errcode_t *err)
				3611	{
				3612	re_bitset_ptr_t sbcset;
				3613	#ifdef RE_ENABLE_I18N
				3614	re_charset_t *mbcset;
				3615	int alloc = 0;
				3616	#endif /* not RE_ENABLE_I18N */
				3617	reg_errcode_t ret;
				3618	re_token_t br_token;
				3619	bin_tree_t *tree;
				3620
				3621	sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
				3622	#ifdef RE_ENABLE_I18N
				3623	mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
				3624	#endif /* RE_ENABLE_I18N */
				3625
				3626	#ifdef RE_ENABLE_I18N
				3627	if (BE (sbcset == NULL \|\| mbcset == NULL, 0))
				3628	#else /* not RE_ENABLE_I18N */
				3629	if (BE (sbcset == NULL, 0))
				3630	#endif /* not RE_ENABLE_I18N */
				3631	{
				3632	*err = REG_ESPACE;
				3633	return NULL;
				3634	}
				3635
				3636	if (non_match)
				3637	{
				3638	#ifdef RE_ENABLE_I18N
				3639	mbcset->non_match = 1;
				3640	#endif /* not RE_ENABLE_I18N */
				3641	}
				3642
				3643	/* We don't care the syntax in this case. */
				3644	ret = build_charclass (trans, sbcset,
				3645	#ifdef RE_ENABLE_I18N
				3646	mbcset, &alloc,
				3647	#endif /* RE_ENABLE_I18N */
				3648	class_name, 0);
				3649
				3650	if (BE (ret != REG_NOERROR, 0))
				3651	{
				3652	re_free (sbcset);
				3653	#ifdef RE_ENABLE_I18N
				3654	free_charset (mbcset);
				3655	#endif /* RE_ENABLE_I18N */
				3656	*err = ret;
				3657	return NULL;
				3658	}
				3659	/* \w match '_' also. */
				3660	for (; *extra; extra++)
				3661	bitset_set (sbcset, *extra);
				3662
				3663	/* If it is non-matching list. */
				3664	if (non_match)
				3665	bitset_not (sbcset);
				3666
				3667	#ifdef RE_ENABLE_I18N
				3668	/* Ensure only single byte characters are set. */
				3669	if (dfa->mb_cur_max > 1)
				3670	bitset_mask (sbcset, dfa->sb_char);
				3671	#endif
				3672
				3673	/* Build a tree for simple bracket. */
				3674	br_token.type = SIMPLE_BRACKET;
				3675	br_token.opr.sbcset = sbcset;
				3676	tree = create_token_tree (dfa, NULL, NULL, &br_token);
				3677	if (BE (tree == NULL, 0))
				3678	goto build_word_op_espace;
				3679
				3680	#ifdef RE_ENABLE_I18N
				3681	if (dfa->mb_cur_max > 1)
				3682	{
				3683	bin_tree_t *mbc_tree;
				3684	/* Build a tree for complex bracket. */
				3685	br_token.type = COMPLEX_BRACKET;
				3686	br_token.opr.mbcset = mbcset;
				3687	dfa->has_mb_node = 1;
				3688	mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
				3689	if (BE (mbc_tree == NULL, 0))
				3690	goto build_word_op_espace;
				3691	/* Then join them by ALT node. */
				3692	tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
				3693	if (BE (mbc_tree != NULL, 1))
				3694	return tree;
				3695	}
				3696	else
				3697	{
				3698	free_charset (mbcset);
				3699	return tree;
				3700	}
				3701	#else /* not RE_ENABLE_I18N */
				3702	return tree;
				3703	#endif /* not RE_ENABLE_I18N */
				3704
				3705	build_word_op_espace:
				3706	re_free (sbcset);
				3707	#ifdef RE_ENABLE_I18N
				3708	free_charset (mbcset);
				3709	#endif /* RE_ENABLE_I18N */
				3710	*err = REG_ESPACE;
				3711	return NULL;
				3712	}
				3713
				3714	/* This is intended for the expressions like "a{1,3}".
				3715	Fetch a number from `input', and return the number.
				3716	Return -1, if the number field is empty like "{,1}".
Stefano Lattarini	ce9171c	2013-04-12 00:36:10 +0200	[diff] [blame]	3717	Return -2, if an error has occurred. */
Ævar Arnfjörð Bjarmason	d18f76d	2010-08-17 09:24:38 +0000	[diff] [blame]	3718
				3719	static int
				3720	fetch_number (re_string_t input, re_token_t token, reg_syntax_t syntax)
				3721	{
				3722	int num = -1;
				3723	unsigned char c;
				3724	while (1)
				3725	{
				3726	fetch_token (token, input, syntax);
				3727	c = token->opr.c;
				3728	if (BE (token->type == END_OF_RE, 0))
				3729	return -2;
				3730	if (token->type == OP_CLOSE_DUP_NUM \|\| c == ',')
				3731	break;
				3732	num = ((token->type != CHARACTER \|\| c < '0' \|\| '9' < c \|\| num == -2)
				3733	? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
				3734	num = (num > RE_DUP_MAX) ? -2 : num;
				3735	}
				3736	return num;
				3737	}
				3738
				3739	#ifdef RE_ENABLE_I18N
				3740	static void
				3741	free_charset (re_charset_t *cset)
				3742	{
				3743	re_free (cset->mbchars);
				3744	# ifdef _LIBC
				3745	re_free (cset->coll_syms);
				3746	re_free (cset->equiv_classes);
				3747	re_free (cset->range_starts);
				3748	re_free (cset->range_ends);
				3749	# endif
				3750	re_free (cset->char_classes);
				3751	re_free (cset);
				3752	}
				3753	#endif /* RE_ENABLE_I18N */
				3754
				3755	/* Functions for binary tree operation. */
				3756
				3757	/* Create a tree node. */
				3758
				3759	static bin_tree_t *
				3760	create_tree (re_dfa_t dfa, bin_tree_t left, bin_tree_t *right,
				3761	re_token_type_t type)
				3762	{
				3763	re_token_t t;
				3764	t.type = type;
				3765	return create_token_tree (dfa, left, right, &t);
				3766	}
				3767
				3768	static bin_tree_t *
				3769	create_token_tree (re_dfa_t dfa, bin_tree_t left, bin_tree_t *right,
				3770	const re_token_t *token)
				3771	{
				3772	bin_tree_t *tree;
				3773	if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
				3774	{
				3775	bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
				3776
				3777	if (storage == NULL)
				3778	return NULL;
				3779	storage->next = dfa->str_tree_storage;
				3780	dfa->str_tree_storage = storage;
				3781	dfa->str_tree_storage_idx = 0;
				3782	}
				3783	tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
				3784
				3785	tree->parent = NULL;
				3786	tree->left = left;
				3787	tree->right = right;
				3788	tree->token = *token;
				3789	tree->token.duplicated = 0;
				3790	tree->token.opt_subexp = 0;
				3791	tree->first = NULL;
				3792	tree->next = NULL;
				3793	tree->node_idx = -1;
				3794
				3795	if (left != NULL)
				3796	left->parent = tree;
				3797	if (right != NULL)
				3798	right->parent = tree;
				3799	return tree;
				3800	}
				3801
				3802	/* Mark the tree SRC as an optional subexpression.
				3803	To be called from preorder or postorder. */
				3804
				3805	static reg_errcode_t
				3806	mark_opt_subexp (void extra, bin_tree_t node)
				3807	{
Johannes Schindelin	56a1a3a	2015-10-26 14:15:25 +0100	[diff] [blame]	3808	int idx = (int) (intptr_t) extra;
Ævar Arnfjörð Bjarmason	d18f76d	2010-08-17 09:24:38 +0000	[diff] [blame]	3809	if (node->token.type == SUBEXP && node->token.opr.idx == idx)
				3810	node->token.opt_subexp = 1;
				3811
				3812	return REG_NOERROR;
				3813	}
				3814
				3815	/* Free the allocated memory inside NODE. */
				3816
				3817	static void
				3818	free_token (re_token_t *node)
				3819	{
				3820	#ifdef RE_ENABLE_I18N
				3821	if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
				3822	free_charset (node->opr.mbcset);
				3823	else
				3824	#endif /* RE_ENABLE_I18N */
				3825	if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
				3826	re_free (node->opr.sbcset);
				3827	}
				3828
				3829	/* Worker function for tree walking. Free the allocated memory inside NODE
				3830	and its children. */
				3831
				3832	static reg_errcode_t
				3833	free_tree (void extra, bin_tree_t node)
				3834	{
				3835	free_token (&node->token);
				3836	return REG_NOERROR;
				3837	}
				3838
				3839
				3840	/* Duplicate the node SRC, and return new node. This is a preorder
				3841	visit similar to the one implemented by the generic visitor, but
				3842	we need more infrastructure to maintain two parallel trees --- so,
				3843	it's easier to duplicate. */
				3844
				3845	static bin_tree_t *
				3846	duplicate_tree (const bin_tree_t root, re_dfa_t dfa)
				3847	{
				3848	const bin_tree_t *node;
				3849	bin_tree_t *dup_root;
				3850	bin_tree_t *p_new = &dup_root, dup_node = root->parent;
				3851
				3852	for (node = root; ; )
				3853	{
				3854	/* Create a new tree and link it back to the current parent. */
				3855	*p_new = create_token_tree (dfa, NULL, NULL, &node->token);
				3856	if (*p_new == NULL)
				3857	return NULL;
				3858	(*p_new)->parent = dup_node;
				3859	(*p_new)->token.duplicated = 1;
				3860	dup_node = *p_new;
				3861
				3862	/* Go to the left node, or up and to the right. */
				3863	if (node->left)
				3864	{
				3865	node = node->left;
				3866	p_new = &dup_node->left;
				3867	}
				3868	else
				3869	{
				3870	const bin_tree_t *prev = NULL;
				3871	while (node->right == prev \|\| node->right == NULL)
				3872	{
				3873	prev = node;
				3874	node = node->parent;
				3875	dup_node = dup_node->parent;
				3876	if (!node)
				3877	return dup_root;
				3878	}
				3879	node = node->right;
				3880	p_new = &dup_node->right;
				3881	}
				3882	}
				3883	}