# Source path: lib-src/globs2ere.awk
# Source blob: 064a2c413091c2b9f7a4e28e6cc8d28833805d8e
# Copyright (C) 2023  Sean Whitton <spwhitton@spwhitton.name>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

# Where each input record is a glob, output a single ERE matching the
# disjunction of all the non-empty input records.
# This is for matching, not expansion: '/' and '.' are not treated specially.
# There is no shell quotation removal, and we do not yet support collating
# symbols or equivalence classes within bracket expressions.
# There is no input validation.

# Use the record separator as the field separator too, so that every
# input record is seen as exactly one field.
BEGIN {
	FS = RS
}

# Step the global cursor `i' forward by one and store the character found
# at that position of the current record in the global `c'.
function getchar () {
	i++
	c = substr($0, i, 1)
}

# Convert the glob in the current (non-empty) record into an ERE and store
# it as the next element of the global array `all'.
#
# Globals used: `i' is the 1-based cursor into $0 and `c' the character
# at that position (both advanced by getchar); res[1..rl] holds the ERE
# fragments produced so far for this record; `j' counts converted globs.
# All per-record state is reset at the end of the rule.
length {
	res[++rl] = "^"
	while (i < length($0)) {
		getchar()
		if (c == "*") {
			if (rl == 1 && res[1] == "^") {
				# A leading `*' only means the match is not anchored
				# at the start, so drop the "^" instead of adding ".*".
				# Checking res[1] matters: rl is also 1 after a leading
				# `*' followed by one token, which must not be dropped.
				rl--
			} else if (rl > 0 && res[rl] != ".*") {
				# Collapse runs of `*' into one ".*"; with rl == 0 the
				# ERE is already unanchored and another `*' adds nothing.
				res[++rl] = ".*"
			}
		} else if (c == "?")
			res[++rl] = "."
		else if (c == "[") {
			res[++rl] = "["; getchar()
			# Glob negation `!' becomes ERE negation `^'.
			if (c == "!") { res[++rl] = "^"; getchar() }
			# A `]' in first position is a literal member of the set.
			if (c == "]") { res[++rl] = "]"; getchar() }
			# A literal `^' in first position would be read as negation
			# in the ERE, so hold it back and emit it as the last member.
			if (c == "^") { circ = 1; getchar() } else circ = 0
			# The end-of-record test keeps an unterminated bracket
			# expression from looping forever.
			while (c != "]" && i <= length($0)) {
				rest = substr($0, i)
				if (match(rest, /^\[:[a-z]+:\]/) == 1) {
					# Copy a character class such as [:alpha:] verbatim
					# and place the cursor on the character after it.
					res[++rl] = substr(rest, 1, RLENGTH)
					i += RLENGTH; c = substr($0, i, 1)
				} else {
					# Braces are required here: without them getchar()
					# would also run after the character class branch
					# and skip a character.
					res[++rl] = c; getchar()
				}
			}
			res[++rl] = circ ? "^]" : "]"
		} else if (c == "\\") {
			# A backslash quotes the next character of the glob.
			getchar()
			escaped(c)
		} else
			escaped(c)
	}
	# A trailing `*' means the match is not anchored at the end.
	if (rl > 0 && res[rl] == ".*")
		rl--
	else
		res[++rl] = "$"

	# Join the fragments into the ERE for this glob.
	j++
	for (i = 1; i <= rl; i++)
		all[j] = all[j] res[i]

	# Reset the per-record state for the next glob.
	i = 0; rl = 0; split("", res)
}

# Append the character C to the fragment list res[], preceded by a
# backslash when C is one of the characters that need quoting.
# Any character may be backslash-escaped in an ERE, but escaping only
# where needed keeps the generated ERE short.  Forward slashes are
# escaped too, which makes the result easy to use inside awk /.../.
function escaped (c) {
	if (c ~ /[[(.*+?{|^$\/\\]/)
		c = "\\" c
	res[++rl] = c
}

# Print the disjunction of all converted globs as a single ERE followed
# by ORS.  Nothing is printed when no non-empty record was read.
END {
	if (j) {
		# Always pass the ERE as an argument, never as the format
		# string: a glob may contain `%', which printf would otherwise
		# interpret as a conversion specification.
		printf "%s", all[1]
		for (i = 2; i <= j; i++)
			printf "|%s", all[i]
		printf "%s", ORS
	}
}