Blame - compat/regex/regex_internal.c - jrn/git

blob: ec51cf34461ef2eeafea7087b2abe9cafc260932 [file] [log] [blame]

Ævar Arnfjörð Bjarmason	d18f76d	2010-08-17 09:24:38 +0000	[diff] [blame]	1	/* Extended regular expression matching and search library.
				2	Copyright (C) 2002-2006, 2010 Free Software Foundation, Inc.
				3	This file is part of the GNU C Library.
				4	Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
				5
				6	The GNU C Library is free software; you can redistribute it and/or
				7	modify it under the terms of the GNU Lesser General Public
				8	License as published by the Free Software Foundation; either
				9	version 2.1 of the License, or (at your option) any later version.
				10
				11	The GNU C Library is distributed in the hope that it will be useful,
				12	but WITHOUT ANY WARRANTY; without even the implied warranty of
				13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				14	Lesser General Public License for more details.
				15
				16	You should have received a copy of the GNU Lesser General Public
Todd Zullinger	4842579	2017-11-07 00:39:33 -0500	[diff] [blame]	17	License along with the GNU C Library; if not, see
				18	<http://www.gnu.org/licenses/>. */
Ævar Arnfjörð Bjarmason	d18f76d	2010-08-17 09:24:38 +0000	[diff] [blame]	19
				20	static void re_string_construct_common (const char *str, int len,
				21	re_string_t *pstr,
				22	RE_TRANSLATE_TYPE trans, int icase,
				23	const re_dfa_t *dfa) internal_function;
				24	static re_dfastate_t create_ci_newstate (const re_dfa_t dfa,
				25	const re_node_set *nodes,
				26	unsigned int hash) internal_function;
				27	static re_dfastate_t create_cd_newstate (const re_dfa_t dfa,
				28	const re_node_set *nodes,
				29	unsigned int context,
				30	unsigned int hash) internal_function;
				31
				32	#ifdef GAWK
				33	#undef MAX /* safety */
				34	static int
				35	MAX(size_t a, size_t b)
				36	{
				37	return (a > b ? a : b);
				38	}
				39	#endif
				40
				41	/* Functions for string operation. */
				42
				43	/* This function allocate the buffers. It is necessary to call
				44	re_string_reconstruct before using the object. */
				45
				46	static reg_errcode_t
				47	internal_function
				48	re_string_allocate (re_string_t pstr, const char str, int len, int init_len,
				49	RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
				50	{
				51	reg_errcode_t ret;
				52	int init_buf_len;
				53
				54	/* Ensure at least one character fits into the buffers. */
				55	if (init_len < dfa->mb_cur_max)
				56	init_len = dfa->mb_cur_max;
				57	init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
				58	re_string_construct_common (str, len, pstr, trans, icase, dfa);
				59
				60	ret = re_string_realloc_buffers (pstr, init_buf_len);
				61	if (BE (ret != REG_NOERROR, 0))
				62	return ret;
				63
				64	pstr->word_char = dfa->word_char;
				65	pstr->word_ops_used = dfa->word_ops_used;
				66	pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
				67	pstr->valid_len = (pstr->mbs_allocated \|\| dfa->mb_cur_max > 1) ? 0 : len;
				68	pstr->valid_raw_len = pstr->valid_len;
				69	return REG_NOERROR;
				70	}
				71
				72	/* This function allocate the buffers, and initialize them. */
				73
				74	static reg_errcode_t
				75	internal_function
				76	re_string_construct (re_string_t pstr, const char str, int len,
				77	RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
				78	{
				79	reg_errcode_t ret;
				80	memset (pstr, '\0', sizeof (re_string_t));
				81	re_string_construct_common (str, len, pstr, trans, icase, dfa);
				82
				83	if (len > 0)
				84	{
				85	ret = re_string_realloc_buffers (pstr, len + 1);
				86	if (BE (ret != REG_NOERROR, 0))
				87	return ret;
				88	}
				89	pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
				90
				91	if (icase)
				92	{
				93	#ifdef RE_ENABLE_I18N
				94	if (dfa->mb_cur_max > 1)
				95	{
				96	while (1)
				97	{
				98	ret = build_wcs_upper_buffer (pstr);
				99	if (BE (ret != REG_NOERROR, 0))
				100	return ret;
				101	if (pstr->valid_raw_len >= len)
				102	break;
				103	if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
				104	break;
				105	ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
				106	if (BE (ret != REG_NOERROR, 0))
				107	return ret;
				108	}
				109	}
				110	else
				111	#endif /* RE_ENABLE_I18N */
				112	build_upper_buffer (pstr);
				113	}
				114	else
				115	{
				116	#ifdef RE_ENABLE_I18N
				117	if (dfa->mb_cur_max > 1)
				118	build_wcs_buffer (pstr);
				119	else
				120	#endif /* RE_ENABLE_I18N */
				121	{
				122	if (trans != NULL)
				123	re_string_translate_buffer (pstr);
				124	else
				125	{
				126	pstr->valid_len = pstr->bufs_len;
				127	pstr->valid_raw_len = pstr->bufs_len;
				128	}
				129	}
				130	}
				131
				132	return REG_NOERROR;
				133	}
				134
				135	/* Helper functions for re_string_allocate, and re_string_construct. */
				136
				137	static reg_errcode_t
				138	internal_function
				139	re_string_realloc_buffers (re_string_t *pstr, int new_buf_len)
				140	{
				141	#ifdef RE_ENABLE_I18N
				142	if (pstr->mb_cur_max > 1)
				143	{
				144	wint_t *new_wcs;
				145
				146	/* Avoid overflow in realloc. */
				147	const size_t max_object_size = MAX (sizeof (wint_t), sizeof (int));
				148	if (BE (SIZE_MAX / max_object_size < new_buf_len, 0))
				149	return REG_ESPACE;
				150
				151	new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
				152	if (BE (new_wcs == NULL, 0))
				153	return REG_ESPACE;
				154	pstr->wcs = new_wcs;
				155	if (pstr->offsets != NULL)
				156	{
				157	int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);
				158	if (BE (new_offsets == NULL, 0))
				159	return REG_ESPACE;
				160	pstr->offsets = new_offsets;
				161	}
				162	}
				163	#endif /* RE_ENABLE_I18N */
				164	if (pstr->mbs_allocated)
				165	{
				166	unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
				167	new_buf_len);
				168	if (BE (new_mbs == NULL, 0))
				169	return REG_ESPACE;
				170	pstr->mbs = new_mbs;
				171	}
				172	pstr->bufs_len = new_buf_len;
				173	return REG_NOERROR;
				174	}
				175
				176
				177	static void
				178	internal_function
				179	re_string_construct_common (const char str, int len, re_string_t pstr,
				180	RE_TRANSLATE_TYPE trans, int icase,
				181	const re_dfa_t *dfa)
				182	{
				183	pstr->raw_mbs = (const unsigned char *) str;
				184	pstr->len = len;
				185	pstr->raw_len = len;
				186	pstr->trans = trans;
				187	pstr->icase = icase ? 1 : 0;
				188	pstr->mbs_allocated = (trans != NULL \|\| icase);
				189	pstr->mb_cur_max = dfa->mb_cur_max;
				190	pstr->is_utf8 = dfa->is_utf8;
				191	pstr->map_notascii = dfa->map_notascii;
				192	pstr->stop = pstr->len;
				193	pstr->raw_stop = pstr->stop;
				194	}
				195
				196	#ifdef RE_ENABLE_I18N
				197
				198	/* Build wide character buffer PSTR->WCS.
				199	If the byte sequence of the string are:
				200	<mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
				201	Then wide character buffer will be:
				202	<wc1> , WEOF , <wc2> , WEOF , <wc3>
				203	We use WEOF for padding, they indicate that the position isn't
				204	a first byte of a multibyte character.
				205
				206	Note that this function assumes PSTR->VALID_LEN elements are already
				207	built and starts from PSTR->VALID_LEN. */
				208
				209	static void
				210	internal_function
				211	build_wcs_buffer (re_string_t *pstr)
				212	{
				213	#ifdef _LIBC
				214	unsigned char buf[MB_LEN_MAX];
				215	assert (MB_LEN_MAX >= pstr->mb_cur_max);
				216	#else
				217	unsigned char buf[64];
				218	#endif
				219	mbstate_t prev_st;
				220	int byte_idx, end_idx, remain_len;
				221	size_t mbclen;
				222
				223	/* Build the buffers from pstr->valid_len to either pstr->len or
				224	pstr->bufs_len. */
				225	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
				226	for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
				227	{
				228	wchar_t wc;
				229	const char *p;
				230
				231	remain_len = end_idx - byte_idx;
				232	prev_st = pstr->cur_state;
				233	/* Apply the translation if we need. */
				234	if (BE (pstr->trans != NULL, 0))
				235	{
				236	int i, ch;
				237
				238	for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
				239	{
				240	ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
				241	buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
				242	}
				243	p = (const char *) buf;
				244	}
				245	else
				246	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
				247	mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
				248	if (BE (mbclen == (size_t) -2, 0))
				249	{
				250	/* The buffer doesn't have enough space, finish to build. */
				251	pstr->cur_state = prev_st;
				252	break;
				253	}
				254	else if (BE (mbclen == (size_t) -1 \|\| mbclen == 0, 0))
				255	{
				256	/* We treat these cases as a singlebyte character. */
				257	mbclen = 1;
				258	wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
				259	if (BE (pstr->trans != NULL, 0))
				260	wc = pstr->trans[wc];
				261	pstr->cur_state = prev_st;
				262	}
				263
				264	/* Write wide character and padding. */
				265	pstr->wcs[byte_idx++] = wc;
				266	/* Write paddings. */
				267	for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
				268	pstr->wcs[byte_idx++] = WEOF;
				269	}
				270	pstr->valid_len = byte_idx;
				271	pstr->valid_raw_len = byte_idx;
				272	}
				273
				274	/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
				275	but for REG_ICASE. */
				276
				277	static reg_errcode_t
				278	internal_function
				279	build_wcs_upper_buffer (re_string_t *pstr)
				280	{
				281	mbstate_t prev_st;
				282	int src_idx, byte_idx, end_idx, remain_len;
				283	size_t mbclen;
				284	#ifdef _LIBC
				285	char buf[MB_LEN_MAX];
				286	assert (MB_LEN_MAX >= pstr->mb_cur_max);
				287	#else
				288	char buf[64];
				289	#endif
				290
				291	byte_idx = pstr->valid_len;
				292	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
				293
				294	/* The following optimization assumes that ASCII characters can be
				295	mapped to wide characters with a simple cast. */
				296	if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
				297	{
				298	while (byte_idx < end_idx)
				299	{
				300	wchar_t wc;
				301
				302	if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
				303	&& mbsinit (&pstr->cur_state))
				304	{
				305	/* In case of a singlebyte character. */
				306	pstr->mbs[byte_idx]
				307	= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
				308	/* The next step uses the assumption that wchar_t is encoded
				309	ASCII-safe: all ASCII values can be converted like this. */
				310	pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
				311	++byte_idx;
				312	continue;
				313	}
				314
				315	remain_len = end_idx - byte_idx;
				316	prev_st = pstr->cur_state;
				317	mbclen = __mbrtowc (&wc,
				318	((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
				319	+ byte_idx), remain_len, &pstr->cur_state);
				320	if (BE (mbclen + 2 > 2, 1))
				321	{
				322	wchar_t wcu = wc;
				323	if (iswlower (wc))
				324	{
				325	size_t mbcdlen;
				326
				327	wcu = towupper (wc);
				328	mbcdlen = wcrtomb (buf, wcu, &prev_st);
				329	if (BE (mbclen == mbcdlen, 1))
				330	memcpy (pstr->mbs + byte_idx, buf, mbclen);
				331	else
				332	{
				333	src_idx = byte_idx;
				334	goto offsets_needed;
				335	}
				336	}
				337	else
				338	memcpy (pstr->mbs + byte_idx,
				339	pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
				340	pstr->wcs[byte_idx++] = wcu;
				341	/* Write paddings. */
				342	for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
				343	pstr->wcs[byte_idx++] = WEOF;
				344	}
				345	else if (mbclen == (size_t) -1 \|\| mbclen == 0)
				346	{
				347	/* It is an invalid character or '\0'. Just use the byte. */
				348	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
				349	pstr->mbs[byte_idx] = ch;
				350	/* And also cast it to wide char. */
				351	pstr->wcs[byte_idx++] = (wchar_t) ch;
				352	if (BE (mbclen == (size_t) -1, 0))
				353	pstr->cur_state = prev_st;
				354	}
				355	else
				356	{
				357	/* The buffer doesn't have enough space, finish to build. */
				358	pstr->cur_state = prev_st;
				359	break;
				360	}
				361	}
				362	pstr->valid_len = byte_idx;
				363	pstr->valid_raw_len = byte_idx;
				364	return REG_NOERROR;
				365	}
				366	else
				367	for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
				368	{
				369	wchar_t wc;
				370	const char *p;
				371	offsets_needed:
				372	remain_len = end_idx - byte_idx;
				373	prev_st = pstr->cur_state;
				374	if (BE (pstr->trans != NULL, 0))
				375	{
				376	int i, ch;
				377
				378	for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
				379	{
				380	ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
				381	buf[i] = pstr->trans[ch];
				382	}
				383	p = (const char *) buf;
				384	}
				385	else
				386	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
				387	mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
				388	if (BE (mbclen + 2 > 2, 1))
				389	{
				390	wchar_t wcu = wc;
				391	if (iswlower (wc))
				392	{
				393	size_t mbcdlen;
				394
				395	wcu = towupper (wc);
				396	mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
				397	if (BE (mbclen == mbcdlen, 1))
				398	memcpy (pstr->mbs + byte_idx, buf, mbclen);
				399	else if (mbcdlen != (size_t) -1)
				400	{
				401	size_t i;
				402
				403	if (byte_idx + mbcdlen > pstr->bufs_len)
				404	{
				405	pstr->cur_state = prev_st;
				406	break;
				407	}
				408
				409	if (pstr->offsets == NULL)
				410	{
				411	pstr->offsets = re_malloc (int, pstr->bufs_len);
				412
				413	if (pstr->offsets == NULL)
				414	return REG_ESPACE;
				415	}
				416	if (!pstr->offsets_needed)
				417	{
				418	for (i = 0; i < (size_t) byte_idx; ++i)
				419	pstr->offsets[i] = i;
				420	pstr->offsets_needed = 1;
				421	}
				422
				423	memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
				424	pstr->wcs[byte_idx] = wcu;
				425	pstr->offsets[byte_idx] = src_idx;
				426	for (i = 1; i < mbcdlen; ++i)
				427	{
				428	pstr->offsets[byte_idx + i]
				429	= src_idx + (i < mbclen ? i : mbclen - 1);
				430	pstr->wcs[byte_idx + i] = WEOF;
				431	}
				432	pstr->len += mbcdlen - mbclen;
				433	if (pstr->raw_stop > src_idx)
				434	pstr->stop += mbcdlen - mbclen;
				435	end_idx = (pstr->bufs_len > pstr->len)
				436	? pstr->len : pstr->bufs_len;
				437	byte_idx += mbcdlen;
				438	src_idx += mbclen;
				439	continue;
				440	}
				441	else
				442	memcpy (pstr->mbs + byte_idx, p, mbclen);
				443	}
				444	else
				445	memcpy (pstr->mbs + byte_idx, p, mbclen);
				446
				447	if (BE (pstr->offsets_needed != 0, 0))
				448	{
				449	size_t i;
				450	for (i = 0; i < mbclen; ++i)
				451	pstr->offsets[byte_idx + i] = src_idx + i;
				452	}
				453	src_idx += mbclen;
				454
				455	pstr->wcs[byte_idx++] = wcu;
				456	/* Write paddings. */
				457	for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
				458	pstr->wcs[byte_idx++] = WEOF;
				459	}
				460	else if (mbclen == (size_t) -1 \|\| mbclen == 0)
				461	{
				462	/* It is an invalid character or '\0'. Just use the byte. */
				463	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
				464
				465	if (BE (pstr->trans != NULL, 0))
				466	ch = pstr->trans [ch];
				467	pstr->mbs[byte_idx] = ch;
				468
				469	if (BE (pstr->offsets_needed != 0, 0))
				470	pstr->offsets[byte_idx] = src_idx;
				471	++src_idx;
				472
				473	/* And also cast it to wide char. */
				474	pstr->wcs[byte_idx++] = (wchar_t) ch;
				475	if (BE (mbclen == (size_t) -1, 0))
				476	pstr->cur_state = prev_st;
				477	}
				478	else
				479	{
				480	/* The buffer doesn't have enough space, finish to build. */
				481	pstr->cur_state = prev_st;
				482	break;
				483	}
				484	}
				485	pstr->valid_len = byte_idx;
				486	pstr->valid_raw_len = src_idx;
				487	return REG_NOERROR;
				488	}
				489
				490	/* Skip characters until the index becomes greater than NEW_RAW_IDX.
				491	Return the index. */
				492
				493	static int
				494	internal_function
				495	re_string_skip_chars (re_string_t pstr, int new_raw_idx, wint_t last_wc)
				496	{
				497	mbstate_t prev_st;
				498	int rawbuf_idx;
				499	size_t mbclen;
				500	wint_t wc = WEOF;
				501
				502	/* Skip the characters which are not necessary to check. */
				503	for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
				504	rawbuf_idx < new_raw_idx;)
				505	{
				506	wchar_t wc2;
				507	int remain_len = pstr->len - rawbuf_idx;
				508	prev_st = pstr->cur_state;
				509	mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
				510	remain_len, &pstr->cur_state);
				511	if (BE (mbclen == (size_t) -2 \|\| mbclen == (size_t) -1 \|\| mbclen == 0, 0))
				512	{
				513	/* We treat these cases as a single byte character. */
				514	if (mbclen == 0 \|\| remain_len == 0)
				515	wc = L'\0';
				516	else
				517	wc = (unsigned char ) (pstr->raw_mbs + rawbuf_idx);
				518	mbclen = 1;
				519	pstr->cur_state = prev_st;
				520	}
				521	else
				522	wc = (wint_t) wc2;
				523	/* Then proceed the next character. */
				524	rawbuf_idx += mbclen;
				525	}
				526	*last_wc = (wint_t) wc;
				527	return rawbuf_idx;
				528	}
				529	#endif /* RE_ENABLE_I18N */
				530
				531	/* Build the buffer PSTR->MBS, and apply the translation if we need.
				532	This function is used in case of REG_ICASE. */
				533
				534	static void
				535	internal_function
				536	build_upper_buffer (re_string_t *pstr)
				537	{
				538	int char_idx, end_idx;
				539	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
				540
				541	for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
				542	{
				543	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
				544	if (BE (pstr->trans != NULL, 0))
				545	ch = pstr->trans[ch];
				546	if (islower (ch))
				547	pstr->mbs[char_idx] = toupper (ch);
				548	else
				549	pstr->mbs[char_idx] = ch;
				550	}
				551	pstr->valid_len = char_idx;
				552	pstr->valid_raw_len = char_idx;
				553	}
				554
				555	/* Apply TRANS to the buffer in PSTR. */
				556
				557	static void
				558	internal_function
				559	re_string_translate_buffer (re_string_t *pstr)
				560	{
				561	int buf_idx, end_idx;
				562	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
				563
				564	for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
				565	{
				566	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
				567	pstr->mbs[buf_idx] = pstr->trans[ch];
				568	}
				569
				570	pstr->valid_len = buf_idx;
				571	pstr->valid_raw_len = buf_idx;
				572	}
				573
				574	/* This function re-construct the buffers.
				575	Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
				576	convert to upper case in case of REG_ICASE, apply translation. */
				577
				578	static reg_errcode_t
				579	internal_function
				580	re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
				581	{
				582	int offset = idx - pstr->raw_mbs_idx;
				583	if (BE (offset < 0, 0))
				584	{
				585	/* Reset buffer. */
				586	#ifdef RE_ENABLE_I18N
				587	if (pstr->mb_cur_max > 1)
				588	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
				589	#endif /* RE_ENABLE_I18N */
				590	pstr->len = pstr->raw_len;
				591	pstr->stop = pstr->raw_stop;
				592	pstr->valid_len = 0;
				593	pstr->raw_mbs_idx = 0;
				594	pstr->valid_raw_len = 0;
				595	pstr->offsets_needed = 0;
				596	pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
				597	: CONTEXT_NEWLINE \| CONTEXT_BEGBUF);
				598	if (!pstr->mbs_allocated)
				599	pstr->mbs = (unsigned char *) pstr->raw_mbs;
				600	offset = idx;
				601	}
				602
				603	if (BE (offset != 0, 1))
				604	{
				605	/* Should the already checked characters be kept? */
				606	if (BE (offset < pstr->valid_raw_len, 1))
				607	{
				608	/* Yes, move them to the front of the buffer. */
				609	#ifdef RE_ENABLE_I18N
				610	if (BE (pstr->offsets_needed, 0))
				611	{
				612	int low = 0, high = pstr->valid_len, mid;
				613	do
				614	{
Derrick Stolee	19716b2	2017-10-08 14:29:37 -0400	[diff] [blame]	615	mid = low + (high - low) / 2;
Ævar Arnfjörð Bjarmason	d18f76d	2010-08-17 09:24:38 +0000	[diff] [blame]	616	if (pstr->offsets[mid] > offset)
				617	high = mid;
				618	else if (pstr->offsets[mid] < offset)
				619	low = mid + 1;
				620	else
				621	break;
				622	}
				623	while (low < high);
				624	if (pstr->offsets[mid] < offset)
				625	++mid;
				626	pstr->tip_context = re_string_context_at (pstr, mid - 1,
				627	eflags);
				628	/* This can be quite complicated, so handle specially
				629	only the common and easy case where the character with
				630	different length representation of lower and upper
				631	case is present at or after offset. */
				632	if (pstr->valid_len > offset
				633	&& mid == offset && pstr->offsets[mid] == offset)
				634	{
				635	memmove (pstr->wcs, pstr->wcs + offset,
				636	(pstr->valid_len - offset) * sizeof (wint_t));
				637	memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
				638	pstr->valid_len -= offset;
				639	pstr->valid_raw_len -= offset;
				640	for (low = 0; low < pstr->valid_len; low++)
				641	pstr->offsets[low] = pstr->offsets[low + offset] - offset;
				642	}
				643	else
				644	{
				645	/* Otherwise, just find out how long the partial multibyte
				646	character at offset is and fill it with WEOF/255. */
				647	pstr->len = pstr->raw_len - idx + offset;
				648	pstr->stop = pstr->raw_stop - idx + offset;
				649	pstr->offsets_needed = 0;
				650	while (mid > 0 && pstr->offsets[mid - 1] == offset)
				651	--mid;
				652	while (mid < pstr->valid_len)
				653	if (pstr->wcs[mid] != WEOF)
				654	break;
				655	else
				656	++mid;
				657	if (mid == pstr->valid_len)
				658	pstr->valid_len = 0;
				659	else
				660	{
				661	pstr->valid_len = pstr->offsets[mid] - offset;
				662	if (pstr->valid_len)
				663	{
				664	for (low = 0; low < pstr->valid_len; ++low)
				665	pstr->wcs[low] = WEOF;
				666	memset (pstr->mbs, 255, pstr->valid_len);
				667	}
				668	}
				669	pstr->valid_raw_len = pstr->valid_len;
				670	}
				671	}
				672	else
				673	#endif
				674	{
				675	pstr->tip_context = re_string_context_at (pstr, offset - 1,
				676	eflags);
				677	#ifdef RE_ENABLE_I18N
				678	if (pstr->mb_cur_max > 1)
				679	memmove (pstr->wcs, pstr->wcs + offset,
				680	(pstr->valid_len - offset) * sizeof (wint_t));
				681	#endif /* RE_ENABLE_I18N */
				682	if (BE (pstr->mbs_allocated, 0))
				683	memmove (pstr->mbs, pstr->mbs + offset,
				684	pstr->valid_len - offset);
				685	pstr->valid_len -= offset;
				686	pstr->valid_raw_len -= offset;
				687	#if DEBUG
				688	assert (pstr->valid_len > 0);
				689	#endif
				690	}
				691	}
				692	else
				693	{
Ævar Arnfjörð Bjarmason	b50f370	2010-08-19 18:30:01 +0000	[diff] [blame]	694	#ifdef RE_ENABLE_I18N
Ævar Arnfjörð Bjarmason	d18f76d	2010-08-17 09:24:38 +0000	[diff] [blame]	695	/* No, skip all characters until IDX. */
				696	int prev_valid_len = pstr->valid_len;
				697
Ævar Arnfjörð Bjarmason	d18f76d	2010-08-17 09:24:38 +0000	[diff] [blame]	698	if (BE (pstr->offsets_needed, 0))
				699	{
				700	pstr->len = pstr->raw_len - idx + offset;
				701	pstr->stop = pstr->raw_stop - idx + offset;
				702	pstr->offsets_needed = 0;
				703	}
				704	#endif
				705	pstr->valid_len = 0;
				706	#ifdef RE_ENABLE_I18N
				707	if (pstr->mb_cur_max > 1)
				708	{
				709	int wcs_idx;
				710	wint_t wc = WEOF;
				711
				712	if (pstr->is_utf8)
				713	{
				714	const unsigned char raw, p, *end;
				715
				716	/* Special case UTF-8. Multi-byte chars start with any
				717	byte other than 0x80 - 0xbf. */
				718	raw = pstr->raw_mbs + pstr->raw_mbs_idx;
				719	end = raw + (offset - pstr->mb_cur_max);
				720	if (end < pstr->raw_mbs)
				721	end = pstr->raw_mbs;
				722	p = raw + offset - 1;
				723	#ifdef _LIBC
				724	/* We know the wchar_t encoding is UCS4, so for the simple
				725	case, ASCII characters, skip the conversion step. */
				726	if (isascii (*p) && BE (pstr->trans == NULL, 1))
				727	{
				728	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
				729	/* pstr->valid_len = 0; */
				730	wc = (wchar_t) *p;
				731	}
				732	else
				733	#endif
				734	for (; p >= end; --p)
				735	if ((*p & 0xc0) != 0x80)
				736	{
				737	mbstate_t cur_state;
				738	wchar_t wc2;
				739	int mlen = raw + pstr->len - p;
				740	unsigned char buf[6];
				741	size_t mbclen;
				742
				743	if (BE (pstr->trans != NULL, 0))
				744	{
				745	int i = mlen < 6 ? mlen : 6;
				746	while (--i >= 0)
				747	buf[i] = pstr->trans[p[i]];
				748	}
				749	/* XXX Don't use mbrtowc, we know which conversion
				750	to use (UTF-8 -> UCS4). */
				751	memset (&cur_state, 0, sizeof (cur_state));
				752	mbclen = __mbrtowc (&wc2, (const char *) p, mlen,
				753	&cur_state);
				754	if (raw + offset - p <= mbclen
				755	&& mbclen < (size_t) -2)
				756	{
				757	memset (&pstr->cur_state, '\0',
				758	sizeof (mbstate_t));
				759	pstr->valid_len = mbclen - (raw + offset - p);
				760	wc = wc2;
				761	}
				762	break;
				763	}
				764	}
				765
				766	if (wc == WEOF)
				767	pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
				768	if (wc == WEOF)
				769	pstr->tip_context
				770	= re_string_context_at (pstr, prev_valid_len - 1, eflags);
				771	else
				772	pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
				773	&& IS_WIDE_WORD_CHAR (wc))
				774	? CONTEXT_WORD
				775	: ((IS_WIDE_NEWLINE (wc)
				776	&& pstr->newline_anchor)
				777	? CONTEXT_NEWLINE : 0));
				778	if (BE (pstr->valid_len, 0))
				779	{
				780	for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
				781	pstr->wcs[wcs_idx] = WEOF;
				782	if (pstr->mbs_allocated)
				783	memset (pstr->mbs, 255, pstr->valid_len);
				784	}
				785	pstr->valid_raw_len = pstr->valid_len;
				786	}
				787	else
				788	#endif /* RE_ENABLE_I18N */
				789	{
				790	int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
				791	pstr->valid_raw_len = 0;
				792	if (pstr->trans)
				793	c = pstr->trans[c];
				794	pstr->tip_context = (bitset_contain (pstr->word_char, c)
				795	? CONTEXT_WORD
				796	: ((IS_NEWLINE (c) && pstr->newline_anchor)
				797	? CONTEXT_NEWLINE : 0));
				798	}
				799	}
				800	if (!BE (pstr->mbs_allocated, 0))
				801	pstr->mbs += offset;
				802	}
				803	pstr->raw_mbs_idx = idx;
				804	pstr->len -= offset;
				805	pstr->stop -= offset;
				806
				807	/* Then build the buffers. */
				808	#ifdef RE_ENABLE_I18N
				809	if (pstr->mb_cur_max > 1)
				810	{
				811	if (pstr->icase)
				812	{
				813	reg_errcode_t ret = build_wcs_upper_buffer (pstr);
				814	if (BE (ret != REG_NOERROR, 0))
				815	return ret;
				816	}
				817	else
				818	build_wcs_buffer (pstr);
				819	}
				820	else
				821	#endif /* RE_ENABLE_I18N */
				822	if (BE (pstr->mbs_allocated, 0))
				823	{
				824	if (pstr->icase)
				825	build_upper_buffer (pstr);
				826	else if (pstr->trans != NULL)
				827	re_string_translate_buffer (pstr);
				828	}
				829	else
				830	pstr->valid_len = pstr->len;
				831
				832	pstr->cur_idx = 0;
				833	return REG_NOERROR;
				834	}
				835
				836	static unsigned char
				837	internal_function __attribute ((pure))
				838	re_string_peek_byte_case (const re_string_t *pstr, int idx)
				839	{
				840	int ch, off;
				841
				842	/* Handle the common (easiest) cases first. */
				843	if (BE (!pstr->mbs_allocated, 1))
				844	return re_string_peek_byte (pstr, idx);
				845
				846	#ifdef RE_ENABLE_I18N
				847	if (pstr->mb_cur_max > 1
				848	&& ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
				849	return re_string_peek_byte (pstr, idx);
				850	#endif
				851
				852	off = pstr->cur_idx + idx;
				853	#ifdef RE_ENABLE_I18N
				854	if (pstr->offsets_needed)
				855	off = pstr->offsets[off];
				856	#endif
				857
				858	ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
				859
				860	#ifdef RE_ENABLE_I18N
				861	/* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
				862	this function returns CAPITAL LETTER I instead of first byte of
				863	DOTLESS SMALL LETTER I. The latter would confuse the parser,
				864	since peek_byte_case doesn't advance cur_idx in any way. */
				865	if (pstr->offsets_needed && !isascii (ch))
				866	return re_string_peek_byte (pstr, idx);
				867	#endif
				868
				869	return ch;
				870	}
				871
				872	static unsigned char
				873	internal_function __attribute ((pure))
				874	re_string_fetch_byte_case (re_string_t *pstr)
				875	{
				876	if (BE (!pstr->mbs_allocated, 1))
				877	return re_string_fetch_byte (pstr);
				878
				879	#ifdef RE_ENABLE_I18N
				880	if (pstr->offsets_needed)
				881	{
				882	int off, ch;
				883
				884	/* For tr_TR.UTF-8 [[:islower:]] there is
				885	[[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
				886	in that case the whole multi-byte character and return
				887	the original letter. On the other side, with
				888	[[: DOTLESS SMALL LETTER I return [[:I, as doing
				889	anything else would complicate things too much. */
				890
				891	if (!re_string_first_byte (pstr, pstr->cur_idx))
				892	return re_string_fetch_byte (pstr);
				893
				894	off = pstr->offsets[pstr->cur_idx];
				895	ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
				896
				897	if (! isascii (ch))
				898	return re_string_fetch_byte (pstr);
				899
				900	re_string_skip_bytes (pstr,
				901	re_string_char_size_at (pstr, pstr->cur_idx));
				902	return ch;
				903	}
				904	#endif
				905
				906	return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
				907	}
				908
				909	static void
				910	internal_function
				911	re_string_destruct (re_string_t *pstr)
				912	{
				913	#ifdef RE_ENABLE_I18N
				914	re_free (pstr->wcs);
				915	re_free (pstr->offsets);
				916	#endif /* RE_ENABLE_I18N */
				917	if (pstr->mbs_allocated)
				918	re_free (pstr->mbs);
				919	}
				920
				921	/* Return the context at IDX in INPUT. */
				922
				923	static unsigned int
				924	internal_function
				925	re_string_context_at (const re_string_t *input, int idx, int eflags)
				926	{
				927	int c;
				928	if (BE (idx < 0, 0))
				929	/* In this case, we use the value stored in input->tip_context,
				930	since we can't know the character in input->mbs[-1] here. */
				931	return input->tip_context;
				932	if (BE (idx == input->len, 0))
				933	return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
				934	: CONTEXT_NEWLINE \| CONTEXT_ENDBUF);
				935	#ifdef RE_ENABLE_I18N
				936	if (input->mb_cur_max > 1)
				937	{
				938	wint_t wc;
				939	int wc_idx = idx;
				940	while(input->wcs[wc_idx] == WEOF)
				941	{
				942	#ifdef DEBUG
				943	/* It must not happen. */
				944	assert (wc_idx >= 0);
				945	#endif
				946	--wc_idx;
				947	if (wc_idx < 0)
				948	return input->tip_context;
				949	}
				950	wc = input->wcs[wc_idx];
				951	if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
				952	return CONTEXT_WORD;
				953	return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
				954	? CONTEXT_NEWLINE : 0);
				955	}
				956	else
				957	#endif
				958	{
				959	c = re_string_byte_at (input, idx);
				960	if (bitset_contain (input->word_char, c))
				961	return CONTEXT_WORD;
				962	return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
				963	}
				964	}
				965
				966	/* Functions for set operation. */
				967
				968	static reg_errcode_t
				969	internal_function
				970	re_node_set_alloc (re_node_set *set, int size)
				971	{
				972	/*
				973	* ADR: valgrind says size can be 0, which then doesn't
				974	* free the block of size 0. Harumph. This seems
				975	* to work ok, though.
				976	*/
				977	if (size == 0)
				978	{
				979	memset(set, 0, sizeof(*set));
				980	return REG_NOERROR;
				981	}
				982	set->alloc = size;
				983	set->nelem = 0;
				984	set->elems = re_malloc (int, size);
				985	if (BE (set->elems == NULL, 0))
				986	return REG_ESPACE;
				987	return REG_NOERROR;
				988	}
				989
				990	static reg_errcode_t
				991	internal_function
				992	re_node_set_init_1 (re_node_set *set, int elem)
				993	{
				994	set->alloc = 1;
				995	set->nelem = 1;
				996	set->elems = re_malloc (int, 1);
				997	if (BE (set->elems == NULL, 0))
				998	{
				999	set->alloc = set->nelem = 0;
				1000	return REG_ESPACE;
				1001	}
				1002	set->elems[0] = elem;
				1003	return REG_NOERROR;
				1004	}
				1005
				1006	static reg_errcode_t
				1007	internal_function
				1008	re_node_set_init_2 (re_node_set *set, int elem1, int elem2)
				1009	{
				1010	set->alloc = 2;
				1011	set->elems = re_malloc (int, 2);
				1012	if (BE (set->elems == NULL, 0))
				1013	return REG_ESPACE;
				1014	if (elem1 == elem2)
				1015	{
				1016	set->nelem = 1;
				1017	set->elems[0] = elem1;
				1018	}
				1019	else
				1020	{
				1021	set->nelem = 2;
				1022	if (elem1 < elem2)
				1023	{
				1024	set->elems[0] = elem1;
				1025	set->elems[1] = elem2;
				1026	}
				1027	else
				1028	{
				1029	set->elems[0] = elem2;
				1030	set->elems[1] = elem1;
				1031	}
				1032	}
				1033	return REG_NOERROR;
				1034	}
				1035
				1036	static reg_errcode_t
				1037	internal_function
				1038	re_node_set_init_copy (re_node_set dest, const re_node_set src)
				1039	{
				1040	dest->nelem = src->nelem;
				1041	if (src->nelem > 0)
				1042	{
				1043	dest->alloc = dest->nelem;
				1044	dest->elems = re_malloc (int, dest->alloc);
				1045	if (BE (dest->elems == NULL, 0))
				1046	{
				1047	dest->alloc = dest->nelem = 0;
				1048	return REG_ESPACE;
				1049	}
				1050	memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
				1051	}
				1052	else
				1053	re_node_set_init_empty (dest);
				1054	return REG_NOERROR;
				1055	}
				1056
				1057	/* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
				1058	DEST. Return value indicate the error code or REG_NOERROR if succeeded.
				1059	Note: We assume dest->elems is NULL, when dest->alloc is 0. */
				1060
				1061	static reg_errcode_t
				1062	internal_function
				1063	re_node_set_add_intersect (re_node_set dest, const re_node_set src1,
				1064	const re_node_set *src2)
				1065	{
				1066	int i1, i2, is, id, delta, sbase;
				1067	if (src1->nelem == 0 \|\| src2->nelem == 0)
				1068	return REG_NOERROR;
				1069
				1070	/* We need dest->nelem + 2 * elems_in_intersection; this is a
				1071	conservative estimate. */
				1072	if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
				1073	{
				1074	int new_alloc = src1->nelem + src2->nelem + dest->alloc;
				1075	int *new_elems = re_realloc (dest->elems, int, new_alloc);
				1076	if (BE (new_elems == NULL, 0))
				1077	return REG_ESPACE;
				1078	dest->elems = new_elems;
				1079	dest->alloc = new_alloc;
				1080	}
				1081
				1082	/* Find the items in the intersection of SRC1 and SRC2, and copy
				1083	into the top of DEST those that are not already in DEST itself. */
				1084	sbase = dest->nelem + src1->nelem + src2->nelem;
				1085	i1 = src1->nelem - 1;
				1086	i2 = src2->nelem - 1;
				1087	id = dest->nelem - 1;
				1088	for (;;)
				1089	{
				1090	if (src1->elems[i1] == src2->elems[i2])
				1091	{
				1092	/* Try to find the item in DEST. Maybe we could binary search? */
				1093	while (id >= 0 && dest->elems[id] > src1->elems[i1])
				1094	--id;
				1095
				1096	if (id < 0 \|\| dest->elems[id] != src1->elems[i1])
				1097	dest->elems[--sbase] = src1->elems[i1];
				1098
				1099	if (--i1 < 0 \|\| --i2 < 0)
				1100	break;
				1101	}
				1102
				1103	/* Lower the highest of the two items. */
				1104	else if (src1->elems[i1] < src2->elems[i2])
				1105	{
				1106	if (--i2 < 0)
				1107	break;
				1108	}
				1109	else
				1110	{
				1111	if (--i1 < 0)
				1112	break;
				1113	}
				1114	}
				1115
				1116	id = dest->nelem - 1;
				1117	is = dest->nelem + src1->nelem + src2->nelem - 1;
				1118	delta = is - sbase + 1;
				1119
				1120	/* Now copy. When DELTA becomes zero, the remaining
				1121	DEST elements are already in place; this is more or
				1122	less the same loop that is in re_node_set_merge. */
				1123	dest->nelem += delta;
				1124	if (delta > 0 && id >= 0)
				1125	for (;;)
				1126	{
				1127	if (dest->elems[is] > dest->elems[id])
				1128	{
				1129	/* Copy from the top. */
				1130	dest->elems[id + delta--] = dest->elems[is--];
				1131	if (delta == 0)
				1132	break;
				1133	}
				1134	else
				1135	{
				1136	/* Slide from the bottom. */
				1137	dest->elems[id + delta] = dest->elems[id];
				1138	if (--id < 0)
				1139	break;
				1140	}
				1141	}
				1142
				1143	/* Copy remaining SRC elements. */
				1144	memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
				1145
				1146	return REG_NOERROR;
				1147	}
				1148
				1149	/* Calculate the union set of the sets SRC1 and SRC2. And store it to
				1150	DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
				1151
				1152	static reg_errcode_t
				1153	internal_function
				1154	re_node_set_init_union (re_node_set dest, const re_node_set src1,
				1155	const re_node_set *src2)
				1156	{
				1157	int i1, i2, id;
				1158	if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
				1159	{
				1160	dest->alloc = src1->nelem + src2->nelem;
				1161	dest->elems = re_malloc (int, dest->alloc);
				1162	if (BE (dest->elems == NULL, 0))
				1163	return REG_ESPACE;
				1164	}
				1165	else
				1166	{
				1167	if (src1 != NULL && src1->nelem > 0)
				1168	return re_node_set_init_copy (dest, src1);
				1169	else if (src2 != NULL && src2->nelem > 0)
				1170	return re_node_set_init_copy (dest, src2);
				1171	else
				1172	re_node_set_init_empty (dest);
				1173	return REG_NOERROR;
				1174	}
				1175	for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
				1176	{
				1177	if (src1->elems[i1] > src2->elems[i2])
				1178	{
				1179	dest->elems[id++] = src2->elems[i2++];
				1180	continue;
				1181	}
				1182	if (src1->elems[i1] == src2->elems[i2])
				1183	++i2;
				1184	dest->elems[id++] = src1->elems[i1++];
				1185	}
				1186	if (i1 < src1->nelem)
				1187	{
				1188	memcpy (dest->elems + id, src1->elems + i1,
				1189	(src1->nelem - i1) * sizeof (int));
				1190	id += src1->nelem - i1;
				1191	}
				1192	else if (i2 < src2->nelem)
				1193	{
				1194	memcpy (dest->elems + id, src2->elems + i2,
				1195	(src2->nelem - i2) * sizeof (int));
				1196	id += src2->nelem - i2;
				1197	}
				1198	dest->nelem = id;
				1199	return REG_NOERROR;
				1200	}
				1201
				1202	/* Calculate the union set of the sets DEST and SRC. And store it to
				1203	DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
				1204
				1205	static reg_errcode_t
				1206	internal_function
				1207	re_node_set_merge (re_node_set dest, const re_node_set src)
				1208	{
				1209	int is, id, sbase, delta;
				1210	if (src == NULL \|\| src->nelem == 0)
				1211	return REG_NOERROR;
				1212	if (dest->alloc < 2 * src->nelem + dest->nelem)
				1213	{
				1214	int new_alloc = 2 * (src->nelem + dest->alloc);
				1215	int *new_buffer = re_realloc (dest->elems, int, new_alloc);
				1216	if (BE (new_buffer == NULL, 0))
				1217	return REG_ESPACE;
				1218	dest->elems = new_buffer;
				1219	dest->alloc = new_alloc;
				1220	}
				1221
				1222	if (BE (dest->nelem == 0, 0))
				1223	{
				1224	dest->nelem = src->nelem;
				1225	memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
				1226	return REG_NOERROR;
				1227	}
				1228
				1229	/* Copy into the top of DEST the items of SRC that are not
				1230	found in DEST. Maybe we could binary search in DEST? */
				1231	for (sbase = dest->nelem + 2 * src->nelem,
				1232	is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
				1233	{
				1234	if (dest->elems[id] == src->elems[is])
				1235	is--, id--;
				1236	else if (dest->elems[id] < src->elems[is])
				1237	dest->elems[--sbase] = src->elems[is--];
				1238	else /* if (dest->elems[id] > src->elems[is]) */
				1239	--id;
				1240	}
				1241
				1242	if (is >= 0)
				1243	{
				1244	/* If DEST is exhausted, the remaining items of SRC must be unique. */
				1245	sbase -= is + 1;
				1246	memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
				1247	}
				1248
				1249	id = dest->nelem - 1;
				1250	is = dest->nelem + 2 * src->nelem - 1;
				1251	delta = is - sbase + 1;
				1252	if (delta == 0)
				1253	return REG_NOERROR;
				1254
				1255	/* Now copy. When DELTA becomes zero, the remaining
				1256	DEST elements are already in place. */
				1257	dest->nelem += delta;
				1258	for (;;)
				1259	{
				1260	if (dest->elems[is] > dest->elems[id])
				1261	{
				1262	/* Copy from the top. */
				1263	dest->elems[id + delta--] = dest->elems[is--];
				1264	if (delta == 0)
				1265	break;
				1266	}
				1267	else
				1268	{
				1269	/* Slide from the bottom. */
				1270	dest->elems[id + delta] = dest->elems[id];
				1271	if (--id < 0)
				1272	{
				1273	/* Copy remaining SRC elements. */
				1274	memcpy (dest->elems, dest->elems + sbase,
				1275	delta * sizeof (int));
				1276	break;
				1277	}
				1278	}
				1279	}
				1280
				1281	return REG_NOERROR;
				1282	}
				1283
				1284	/* Insert the new element ELEM to the re_node_set* SET.
				1285	SET should not already have ELEM.
Stefano Lattarini	ce9171c	2013-04-12 00:36:10 +0200	[diff] [blame]	1286	return -1 if an error has occurred, return 1 otherwise. */
Ævar Arnfjörð Bjarmason	d18f76d	2010-08-17 09:24:38 +0000	[diff] [blame]	1287
				1288	static int
				1289	internal_function
				1290	re_node_set_insert (re_node_set *set, int elem)
				1291	{
				1292	int idx;
				1293	/* In case the set is empty. */
				1294	if (set->alloc == 0)
				1295	{
				1296	if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
				1297	return 1;
				1298	else
				1299	return -1;
				1300	}
				1301
				1302	if (BE (set->nelem, 0) == 0)
				1303	{
				1304	/* We already guaranteed above that set->alloc != 0. */
				1305	set->elems[0] = elem;
				1306	++set->nelem;
				1307	return 1;
				1308	}
				1309
				1310	/* Realloc if we need. */
				1311	if (set->alloc == set->nelem)
				1312	{
				1313	int *new_elems;
				1314	set->alloc = set->alloc * 2;
				1315	new_elems = re_realloc (set->elems, int, set->alloc);
				1316	if (BE (new_elems == NULL, 0))
				1317	return -1;
				1318	set->elems = new_elems;
				1319	}
				1320
				1321	/* Move the elements which follows the new element. Test the
				1322	first element separately to skip a check in the inner loop. */
				1323	if (elem < set->elems[0])
				1324	{
				1325	idx = 0;
				1326	for (idx = set->nelem; idx > 0; idx--)
				1327	set->elems[idx] = set->elems[idx - 1];
				1328	}
				1329	else
				1330	{
				1331	for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
				1332	set->elems[idx] = set->elems[idx - 1];
				1333	}
				1334
				1335	/* Insert the new element. */
				1336	set->elems[idx] = elem;
				1337	++set->nelem;
				1338	return 1;
				1339	}
				1340
				1341	/* Insert the new element ELEM to the re_node_set* SET.
				1342	SET should not already have any element greater than or equal to ELEM.
Stefano Lattarini	ce9171c	2013-04-12 00:36:10 +0200	[diff] [blame]	1343	Return -1 if an error has occurred, return 1 otherwise. */
Ævar Arnfjörð Bjarmason	d18f76d	2010-08-17 09:24:38 +0000	[diff] [blame]	1344
				1345	static int
				1346	internal_function
				1347	re_node_set_insert_last (re_node_set *set, int elem)
				1348	{
				1349	/* Realloc if we need. */
				1350	if (set->alloc == set->nelem)
				1351	{
				1352	int *new_elems;
				1353	set->alloc = (set->alloc + 1) * 2;
				1354	new_elems = re_realloc (set->elems, int, set->alloc);
				1355	if (BE (new_elems == NULL, 0))
				1356	return -1;
				1357	set->elems = new_elems;
				1358	}
				1359
				1360	/* Insert the new element. */
				1361	set->elems[set->nelem++] = elem;
				1362	return 1;
				1363	}
				1364
				1365	/* Compare two node sets SET1 and SET2.
				1366	return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */
				1367
				1368	static int
				1369	internal_function __attribute ((pure))
				1370	re_node_set_compare (const re_node_set set1, const re_node_set set2)
				1371	{
				1372	int i;
				1373	if (set1 == NULL \|\| set2 == NULL \|\| set1->nelem != set2->nelem)
				1374	return 0;
				1375	for (i = set1->nelem ; --i >= 0 ; )
				1376	if (set1->elems[i] != set2->elems[i])
				1377	return 0;
				1378	return 1;
				1379	}
				1380
				1381	/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
				1382
				1383	static int
				1384	internal_function __attribute ((pure))
				1385	re_node_set_contains (const re_node_set *set, int elem)
				1386	{
				1387	unsigned int idx, right, mid;
				1388	if (set->nelem <= 0)
				1389	return 0;
				1390
				1391	/* Binary search the element. */
				1392	idx = 0;
				1393	right = set->nelem - 1;
				1394	while (idx < right)
				1395	{
Derrick Stolee	19716b2	2017-10-08 14:29:37 -0400	[diff] [blame]	1396	mid = idx + (right - idx) / 2;
Ævar Arnfjörð Bjarmason	d18f76d	2010-08-17 09:24:38 +0000	[diff] [blame]	1397	if (set->elems[mid] < elem)
				1398	idx = mid + 1;
				1399	else
				1400	right = mid;
				1401	}
				1402	return set->elems[idx] == elem ? idx + 1 : 0;
				1403	}
				1404
				1405	static void
				1406	internal_function
				1407	re_node_set_remove_at (re_node_set *set, int idx)
				1408	{
				1409	if (idx < 0 \|\| idx >= set->nelem)
				1410	return;
				1411	--set->nelem;
				1412	for (; idx < set->nelem; idx++)
				1413	set->elems[idx] = set->elems[idx + 1];
				1414	}
				1415
				1416
				1417	/* Add the token TOKEN to dfa->nodes, and return the index of the token.
Stefano Lattarini	ce9171c	2013-04-12 00:36:10 +0200	[diff] [blame]	1418	Or return -1, if an error has occurred. */
Ævar Arnfjörð Bjarmason	d18f76d	2010-08-17 09:24:38 +0000	[diff] [blame]	1419
				1420	static int
				1421	internal_function
				1422	re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
				1423	{
				1424	if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
				1425	{
				1426	size_t new_nodes_alloc = dfa->nodes_alloc * 2;
				1427	int new_nexts, new_indices;
				1428	re_node_set new_edests, new_eclosures;
				1429	re_token_t *new_nodes;
				1430
				1431	/* Avoid overflows in realloc. */
				1432	const size_t max_object_size = MAX (sizeof (re_token_t),
				1433	MAX (sizeof (re_node_set),
				1434	sizeof (int)));
				1435	if (BE (SIZE_MAX / max_object_size < new_nodes_alloc, 0))
				1436	return -1;
				1437
				1438	new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
				1439	if (BE (new_nodes == NULL, 0))
				1440	return -1;
				1441	dfa->nodes = new_nodes;
				1442	new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
				1443	new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
				1444	new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
				1445	new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
				1446	if (BE (new_nexts == NULL \|\| new_indices == NULL
				1447	\|\| new_edests == NULL \|\| new_eclosures == NULL, 0))
				1448	return -1;
				1449	dfa->nexts = new_nexts;
				1450	dfa->org_indices = new_indices;
				1451	dfa->edests = new_edests;
				1452	dfa->eclosures = new_eclosures;
				1453	dfa->nodes_alloc = new_nodes_alloc;
				1454	}
				1455	dfa->nodes[dfa->nodes_len] = token;
				1456	dfa->nodes[dfa->nodes_len].constraint = 0;
				1457	#ifdef RE_ENABLE_I18N
				1458	dfa->nodes[dfa->nodes_len].accept_mb =
				1459	(token.type == OP_PERIOD && dfa->mb_cur_max > 1) \|\| token.type == COMPLEX_BRACKET;
				1460	#endif
				1461	dfa->nexts[dfa->nodes_len] = -1;
				1462	re_node_set_init_empty (dfa->edests + dfa->nodes_len);
				1463	re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
				1464	return dfa->nodes_len++;
				1465	}
				1466
				1467	static inline unsigned int
				1468	internal_function
				1469	calc_state_hash (const re_node_set *nodes, unsigned int context)
				1470	{
				1471	unsigned int hash = nodes->nelem + context;
				1472	int i;
				1473	for (i = 0 ; i < nodes->nelem ; i++)
				1474	hash += nodes->elems[i];
				1475	return hash;
				1476	}
				1477
				1478	/* Search for the state whose node_set is equivalent to NODES.
				1479	Return the pointer to the state, if we found it in the DFA.
				1480	Otherwise create the new one and return it. In case of an error
				1481	return NULL and set the error code in ERR.
				1482	Note: - We assume NULL as the invalid state, then it is possible that
				1483	return value is NULL and ERR is REG_NOERROR.
				1484	- We never return non-NULL value in case of any errors, it is for
				1485	optimization. */
				1486
				1487	static re_dfastate_t *
				1488	internal_function
				1489	re_acquire_state (reg_errcode_t err, const re_dfa_t dfa,
				1490	const re_node_set *nodes)
				1491	{
				1492	unsigned int hash;
				1493	re_dfastate_t *new_state;
				1494	struct re_state_table_entry *spot;
				1495	int i;
				1496	if (BE (nodes->nelem == 0, 0))
				1497	{
				1498	*err = REG_NOERROR;
				1499	return NULL;
				1500	}
				1501	hash = calc_state_hash (nodes, 0);
				1502	spot = dfa->state_table + (hash & dfa->state_hash_mask);
				1503
				1504	for (i = 0 ; i < spot->num ; i++)
				1505	{
				1506	re_dfastate_t *state = spot->array[i];
				1507	if (hash != state->hash)
				1508	continue;
				1509	if (re_node_set_compare (&state->nodes, nodes))
				1510	return state;
				1511	}
				1512
				1513	/* There are no appropriate state in the dfa, create the new one. */
				1514	new_state = create_ci_newstate (dfa, nodes, hash);
				1515	if (BE (new_state == NULL, 0))
				1516	*err = REG_ESPACE;
				1517
				1518	return new_state;
				1519	}
				1520
				1521	/* Search for the state whose node_set is equivalent to NODES and
				1522	whose context is equivalent to CONTEXT.
				1523	Return the pointer to the state, if we found it in the DFA.
				1524	Otherwise create the new one and return it. In case of an error
				1525	return NULL and set the error code in ERR.
				1526	Note: - We assume NULL as the invalid state, then it is possible that
				1527	return value is NULL and ERR is REG_NOERROR.
				1528	- We never return non-NULL value in case of any errors, it is for
				1529	optimization. */
				1530
				1531	static re_dfastate_t *
				1532	internal_function
				1533	re_acquire_state_context (reg_errcode_t err, const re_dfa_t dfa,
				1534	const re_node_set *nodes, unsigned int context)
				1535	{
				1536	unsigned int hash;
				1537	re_dfastate_t *new_state;
				1538	struct re_state_table_entry *spot;
				1539	int i;
				1540	if (nodes->nelem == 0)
				1541	{
				1542	*err = REG_NOERROR;
				1543	return NULL;
				1544	}
				1545	hash = calc_state_hash (nodes, context);
				1546	spot = dfa->state_table + (hash & dfa->state_hash_mask);
				1547
				1548	for (i = 0 ; i < spot->num ; i++)
				1549	{
				1550	re_dfastate_t *state = spot->array[i];
				1551	if (state->hash == hash
				1552	&& state->context == context
				1553	&& re_node_set_compare (state->entrance_nodes, nodes))
				1554	return state;
				1555	}
				1556	/* There are no appropriate state in `dfa', create the new one. */
				1557	new_state = create_cd_newstate (dfa, nodes, context, hash);
				1558	if (BE (new_state == NULL, 0))
				1559	*err = REG_ESPACE;
				1560
				1561	return new_state;
				1562	}
				1563
				1564	/* Finish initialization of the new state NEWSTATE, and using its hash value
				1565	HASH put in the appropriate bucket of DFA's state table. Return value
				1566	indicates the error code if failed. */
				1567
				1568	static reg_errcode_t
				1569	register_state (const re_dfa_t dfa, re_dfastate_t newstate,
				1570	unsigned int hash)
				1571	{
				1572	struct re_state_table_entry *spot;
				1573	reg_errcode_t err;
				1574	int i;
				1575
				1576	newstate->hash = hash;
				1577	err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
				1578	if (BE (err != REG_NOERROR, 0))
				1579	return REG_ESPACE;
				1580	for (i = 0; i < newstate->nodes.nelem; i++)
				1581	{
				1582	int elem = newstate->nodes.elems[i];
				1583	if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
				1584	if (re_node_set_insert_last (&newstate->non_eps_nodes, elem) < 0)
				1585	return REG_ESPACE;
				1586	}
				1587
				1588	spot = dfa->state_table + (hash & dfa->state_hash_mask);
				1589	if (BE (spot->alloc <= spot->num, 0))
				1590	{
				1591	int new_alloc = 2 * spot->num + 2;
				1592	re_dfastate_t *new_array = re_realloc (spot->array, re_dfastate_t ,
				1593	new_alloc);
				1594	if (BE (new_array == NULL, 0))
				1595	return REG_ESPACE;
				1596	spot->array = new_array;
				1597	spot->alloc = new_alloc;
				1598	}
				1599	spot->array[spot->num++] = newstate;
				1600	return REG_NOERROR;
				1601	}
				1602
				1603	static void
				1604	free_state (re_dfastate_t *state)
				1605	{
				1606	re_node_set_free (&state->non_eps_nodes);
				1607	re_node_set_free (&state->inveclosure);
				1608	if (state->entrance_nodes != &state->nodes)
				1609	{
				1610	re_node_set_free (state->entrance_nodes);
				1611	re_free (state->entrance_nodes);
				1612	}
				1613	re_node_set_free (&state->nodes);
				1614	re_free (state->word_trtable);
				1615	re_free (state->trtable);
				1616	re_free (state);
				1617	}
				1618
Elijah Newren	03670c8	2019-11-05 17:07:30 +0000	[diff] [blame]	1619	/* Create the new state which is independent of contexts.
Ævar Arnfjörð Bjarmason	d18f76d	2010-08-17 09:24:38 +0000	[diff] [blame]	1620	Return the new state if succeeded, otherwise return NULL. */
				1621
				1622	static re_dfastate_t *
				1623	internal_function
				1624	create_ci_newstate (const re_dfa_t dfa, const re_node_set nodes,
				1625	unsigned int hash)
				1626	{
				1627	int i;
				1628	reg_errcode_t err;
				1629	re_dfastate_t *newstate;
				1630
				1631	newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
				1632	if (BE (newstate == NULL, 0))
				1633	return NULL;
				1634	err = re_node_set_init_copy (&newstate->nodes, nodes);
				1635	if (BE (err != REG_NOERROR, 0))
				1636	{
				1637	re_free (newstate);
				1638	return NULL;
				1639	}
				1640
				1641	newstate->entrance_nodes = &newstate->nodes;
				1642	for (i = 0 ; i < nodes->nelem ; i++)
				1643	{
				1644	re_token_t *node = dfa->nodes + nodes->elems[i];
				1645	re_token_type_t type = node->type;
				1646	if (type == CHARACTER && !node->constraint)
				1647	continue;
				1648	#ifdef RE_ENABLE_I18N
				1649	newstate->accept_mb \|= node->accept_mb;
				1650	#endif /* RE_ENABLE_I18N */
				1651
				1652	/* If the state has the halt node, the state is a halt state. */
				1653	if (type == END_OF_RE)
				1654	newstate->halt = 1;
				1655	else if (type == OP_BACK_REF)
				1656	newstate->has_backref = 1;
				1657	else if (type == ANCHOR \|\| node->constraint)
				1658	newstate->has_constraint = 1;
				1659	}
				1660	err = register_state (dfa, newstate, hash);
				1661	if (BE (err != REG_NOERROR, 0))
				1662	{
				1663	free_state (newstate);
				1664	newstate = NULL;
				1665	}
				1666	return newstate;
				1667	}
				1668
				1669	/* Create the new state which is depend on the context CONTEXT.
				1670	Return the new state if succeeded, otherwise return NULL. */
				1671
				1672	static re_dfastate_t *
				1673	internal_function
				1674	create_cd_newstate (const re_dfa_t dfa, const re_node_set nodes,
				1675	unsigned int context, unsigned int hash)
				1676	{
				1677	int i, nctx_nodes = 0;
				1678	reg_errcode_t err;
				1679	re_dfastate_t *newstate;
				1680
				1681	newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
				1682	if (BE (newstate == NULL, 0))
				1683	return NULL;
				1684	err = re_node_set_init_copy (&newstate->nodes, nodes);
				1685	if (BE (err != REG_NOERROR, 0))
				1686	{
				1687	re_free (newstate);
				1688	return NULL;
				1689	}
				1690
				1691	newstate->context = context;
				1692	newstate->entrance_nodes = &newstate->nodes;
				1693
				1694	for (i = 0 ; i < nodes->nelem ; i++)
				1695	{
				1696	re_token_t *node = dfa->nodes + nodes->elems[i];
				1697	re_token_type_t type = node->type;
				1698	unsigned int constraint = node->constraint;
				1699
				1700	if (type == CHARACTER && !constraint)
				1701	continue;
				1702	#ifdef RE_ENABLE_I18N
				1703	newstate->accept_mb \|= node->accept_mb;
				1704	#endif /* RE_ENABLE_I18N */
				1705
				1706	/* If the state has the halt node, the state is a halt state. */
				1707	if (type == END_OF_RE)
				1708	newstate->halt = 1;
				1709	else if (type == OP_BACK_REF)
				1710	newstate->has_backref = 1;
				1711
				1712	if (constraint)
				1713	{
				1714	if (newstate->entrance_nodes == &newstate->nodes)
				1715	{
				1716	newstate->entrance_nodes = re_malloc (re_node_set, 1);
				1717	if (BE (newstate->entrance_nodes == NULL, 0))
				1718	{
				1719	free_state (newstate);
				1720	return NULL;
				1721	}
				1722	if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
				1723	!= REG_NOERROR)
				1724	return NULL;
				1725	nctx_nodes = 0;
				1726	newstate->has_constraint = 1;
				1727	}
				1728
				1729	if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
				1730	{
				1731	re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
				1732	++nctx_nodes;
				1733	}
				1734	}
				1735	}
				1736	err = register_state (dfa, newstate, hash);
				1737	if (BE (err != REG_NOERROR, 0))
				1738	{
				1739	free_state (newstate);
				1740	newstate = NULL;
				1741	}
				1742	return newstate;
				1743	}