From 9bb3b2f7a54d7c1a9b89f02a7e17bf0ef4f5cd6c Mon Sep 17 00:00:00 2001 From: Matthew Nibecker Date: Thu, 26 Oct 2023 12:22:18 -0700 Subject: [PATCH 1/9] Add grok function Closes #4140 --- docs/language/functions/README.md | 1 + docs/language/functions/grok.md | 44 +++++ pkg/grok/base.go | 75 ++++++++ pkg/grok/gen.go | 49 +++++ pkg/grok/grok-patterns | 98 ++++++++++ pkg/grok/grok.go | 238 +++++++++++++++++++++++++ pkg/grok/host_test.go | 100 +++++++++++ pkg/grok/patterns_test.go | 76 ++++++++ runtime/expr/function/function.go | 3 + runtime/expr/function/grok.go | 106 +++++++++++ runtime/expr/function/ztests/grok.yaml | 38 ++++ 11 files changed, 828 insertions(+) create mode 100644 docs/language/functions/grok.md create mode 100644 pkg/grok/base.go create mode 100644 pkg/grok/gen.go create mode 100644 pkg/grok/grok-patterns create mode 100644 pkg/grok/grok.go create mode 100644 pkg/grok/host_test.go create mode 100644 pkg/grok/patterns_test.go create mode 100644 runtime/expr/function/grok.go create mode 100644 runtime/expr/function/ztests/grok.yaml diff --git a/docs/language/functions/README.md b/docs/language/functions/README.md index 5c0dcc61b2..4bd878c34e 100644 --- a/docs/language/functions/README.md +++ b/docs/language/functions/README.md @@ -28,6 +28,7 @@ Zed's [primitive types](../../formats/zed.md#1-primitive-types), e.g., * [flatten](flatten.md) - transform a record into a flattened map * [floor](floor.md) - floor of a number * [grep](grep.md) - search strings inside of values +* [grok](grok.md) - parse a string into a structured record * [has](has.md) - test existence of values * [hex](hex.md) - encode/decode hexadecimal strings * [has_error](has_error.md) - test if a value has an error diff --git a/docs/language/functions/grok.md b/docs/language/functions/grok.md new file mode 100644 index 0000000000..8db8e12151 --- /dev/null +++ b/docs/language/functions/grok.md @@ -0,0 +1,44 @@ +### Function + +  **grok** — parse a string using a grok pattern + +### 
Synopsis + +``` +grok(pattern: string, s: string) -> any +grok(pattern: string, s: string, definitions: string) -> any +``` + +### Description + +The _grok_ function parses a string using a grok pattern and returns +a record containing the parsed fields. The syntax for a grok pattern +is `%{pattern:field_name}` where _pattern_ is the name of the pattern +to match text with and _field_name_ is the resultant field name of the capture +value. + +When provided with three arguments, the third argument, definitions, is a string +of named patterns separated by newlines in the format `PATTERN_NAME PATTERN`. +The named patterns can then be referenced in the grok pattern argument. + +#### Included Patterns + +The _grok_ function by default includes a set of builtin named patterns +that can be referenced in any pattern. The included named patterns can be seen +[here](https://raw.githubusercontent.com/brimdata/zed/main/pkg/grok/grok-patterns). + +### Examples + +Parsing a simple log line using the builtin named patterns: +```mdtest-command +echo '"2020-09-16T04:20:42.45+01:00 DEBUG This is a sample debug log message"' \ + | zq -Z 'yield grok("%{TIMESTAMP_ISO8601:timestamp} %{LOGLEVEL:level} %{GREEDYDATA:message}", this)' - +``` +=> +```mdtest-output +{ + timestamp: "2020-09-16T04:20:42.45+01:00", + level: "DEBUG", + message: "This is a sample debug log message" +} +``` diff --git a/pkg/grok/base.go b/pkg/grok/base.go new file mode 100644 index 0000000000..ee89950f2e --- /dev/null +++ b/pkg/grok/base.go @@ -0,0 +1,75 @@ +// Code generated by gen.go; DO NOT EDIT. 
+ +package grok + +func NewBase() Host { + h := New() + h.Must("USERNAME", "[a-zA-Z0-9._-]+") + h.Must("USER", "%{USERNAME}") + h.Must("INT", "(?:[+-]?(?:[0-9]+))") + h.Must("BASE10NUM", "([+-]?(?:[0-9]+(?:\\.[0-9]+)?)|\\.[0-9]+)") + h.Must("NUMBER", "(?:%{BASE10NUM})") + h.Must("BASE16NUM", "[+-]?(?:0x)?(?:[0-9A-Fa-f]+)") + h.Must("BASE16FLOAT", "\\b[+-]?(?:0x)?(?:(?:[0-9A-Fa-f]+(?:\\.[0-9A-Fa-f]*)?)|(?:\\.[0-9A-Fa-f]+))\\b") + h.Must("POSINT", "\\b(?:[1-9][0-9]*)\\b") + h.Must("NONNEGINT", "\\b(?:[0-9]+)\\b") + h.Must("WORD", "\\b\\w+\\b") + h.Must("NOTSPACE", "\\S+") + h.Must("SPACE", "\\s*") + h.Must("DATA", ".*?") + h.Must("GREEDYDATA", ".*") + h.Must("QUOTEDSTRING", "\"([^\"\\\\]*(\\\\.[^\"\\\\]*)*)\"|\\'([^\\'\\\\]*(\\\\.[^\\'\\\\]*)*)\\'") + h.Must("UUID", "[A-Fa-f0-9]{8}-(?:[A-Fa-f0-9]{4}-){3}[A-Fa-f0-9]{12}") + h.Must("CISCOMAC", "(?:(?:[A-Fa-f0-9]{4}\\.){2}[A-Fa-f0-9]{4})") + h.Must("WINDOWSMAC", "(?:(?:[A-Fa-f0-9]{2}-){5}[A-Fa-f0-9]{2})") + h.Must("COMMONMAC", "(?:(?:[A-Fa-f0-9]{2}:){5}[A-Fa-f0-9]{2})") + h.Must("MAC", "(?:%{CISCOMAC}|%{WINDOWSMAC}|%{COMMONMAC})") + h.Must("IPV6", 
"((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3}))|:)))(%.+)?") + h.Must("IPV4", "(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)") + h.Must("IP", "(?:%{IPV6}|%{IPV4})") + h.Must("HOSTNAME", "\\b(?:[0-9A-Za-z][0-9A-Za-z-]{0,62})(?:\\.(?:[0-9A-Za-z][0-9A-Za-z-]{0,62}))*(\\.?|\\b)") + h.Must("HOST", "%{HOSTNAME}") + h.Must("IPORHOST", "(?:%{HOSTNAME}|%{IP})") + h.Must("HOSTPORT", "%{IPORHOST}:%{POSINT}") + h.Must("UNIXPATH", "(/[\\w_%!$@:.,-]?/?)(\\S+)?") + h.Must("WINPATH", "([A-Za-z]:|\\\\)(?:\\\\[^\\\\?*]*)+") + h.Must("PATH", "(?:%{UNIXPATH}|%{WINPATH})") + h.Must("TTY", "(?:/dev/(pts|tty([pq])?)(\\w+)?/?(?:[0-9]+))") + h.Must("URIPROTO", "[A-Za-z]+(\\+[A-Za-z+]+)?") + h.Must("URIHOST", "%{IPORHOST}(?::%{POSINT:port})?") + h.Must("URIPATH", "(?:/[A-Za-z0-9$.+!*'(){},~:;=@#%_\\-]*)+") + h.Must("URIPARAM", "\\?[A-Za-z0-9$.+!*'|(){},~@#%&/=:;_?\\-\\[\\]]*") + 
h.Must("URIPATHPARAM", "%{URIPATH}(?:%{URIPARAM})?") + h.Must("URI", "%{URIPROTO}://(?:%{USER}(?::[^@]*)?@)?(?:%{URIHOST})?(?:%{URIPATHPARAM})?") + h.Must("MONTH", "\\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\\b") + h.Must("MONTHNUM", "(?:0?[1-9]|1[0-2])") + h.Must("MONTHDAY", "(?:(?:0[1-9])|(?:[12][0-9])|(?:3[01])|[1-9])") + h.Must("DAY", "(?:Mon(?:day)?|Tue(?:sday)?|Wed(?:nesday)?|Thu(?:rsday)?|Fri(?:day)?|Sat(?:urday)?|Sun(?:day)?)") + h.Must("YEAR", "(\\d\\d){1,2}") + h.Must("HOUR", "(?:2[0123]|[01]?[0-9])") + h.Must("MINUTE", "(?:[0-5][0-9])") + h.Must("SECOND", "(?:(?:[0-5][0-9]|60)(?:[:.,][0-9]+)?)") + h.Must("TIME", "([^0-9]?)%{HOUR}:%{MINUTE}(?::%{SECOND})([^0-9]?)") + h.Must("DATE_US", "%{MONTHNUM}[/-]%{MONTHDAY}[/-]%{YEAR}") + h.Must("DATE_EU", "%{MONTHDAY}[./-]%{MONTHNUM}[./-]%{YEAR}") + h.Must("ISO8601_TIMEZONE", "(?:Z|[+-]%{HOUR}(?::?%{MINUTE}))") + h.Must("ISO8601_SECOND", "(?:%{SECOND}|60)") + h.Must("TIMESTAMP_ISO8601", "%{YEAR}-%{MONTHNUM}-%{MONTHDAY}[T ]%{HOUR}:?%{MINUTE}(?::?%{SECOND})?%{ISO8601_TIMEZONE}?") + h.Must("DATE", "%{DATE_US}|%{DATE_EU}") + h.Must("DATESTAMP", "%{DATE}[- ]%{TIME}") + h.Must("TZ", "(?:[PMCE][SD]T|UTC|GMT)") + h.Must("DATESTAMP_RFC822", "%{DAY} %{MONTH} %{MONTHDAY} %{YEAR} %{TIME} %{TZ}") + h.Must("DATESTAMP_OTHER", "%{DAY} %{MONTH} %{MONTHDAY} %{TIME} %{TZ} %{YEAR}") + h.Must("SYSLOGTIMESTAMP", "%{MONTH} +%{MONTHDAY} %{TIME}") + h.Must("PROG", "(?:[\\w._/%-]+)") + h.Must("SYSLOGPROG", "%{PROG:program}(?:\\[%{POSINT:pid}\\])?") + h.Must("SYSLOGHOST", "%{IPORHOST}") + h.Must("SYSLOGFACILITY", "<%{NONNEGINT:facility}.%{NONNEGINT:priority}>") + h.Must("HTTPDATE", "%{MONTHDAY}/%{MONTH}/%{YEAR}:%{TIME} %{INT}") + h.Must("QS", "%{QUOTEDSTRING}") + h.Must("SYSLOGBASE", "%{SYSLOGTIMESTAMP:timestamp} (?:%{SYSLOGFACILITY} )?%{SYSLOGHOST:logsource} %{SYSLOGPROG}:") + h.Must("COMMONAPACHELOG", "%{IPORHOST:clientip} %{USER:ident} 
%{USER:auth} \\[%{HTTPDATE:timestamp}\\] \"(?:%{WORD:verb} %{NOTSPACE:request}(?: HTTP/%{NUMBER:httpversion})?|%{DATA:rawrequest})\" %{NUMBER:response} (?:%{NUMBER:bytes}|-)") + h.Must("COMBINEDAPACHELOG", "%{COMMONAPACHELOG} %{QS:referrer} %{QS:agent}") + h.Must("LOGLEVEL", "([A-a]lert|ALERT|[T|t]race|TRACE|[D|d]ebug|DEBUG|[N|n]otice|NOTICE|[I|i]nfo|INFO|[W|w]arn?(?:ing)?|WARN?(?:ING)?|[E|e]rr?(?:or)?|ERR?(?:OR)?|[C|c]rit?(?:ical)?|CRIT?(?:ICAL)?|[F|f]atal|FATAL|[S|s]evere|SEVERE|EMERG(?:ENCY)?|[Ee]merg(?:ency)?)") + return h +} diff --git a/pkg/grok/gen.go b/pkg/grok/gen.go new file mode 100644 index 0000000000..0d2d0bcc71 --- /dev/null +++ b/pkg/grok/gen.go @@ -0,0 +1,49 @@ +//go:build ignore + +package main + +import ( + "bufio" + _ "embed" + "os" + "regexp" + "strings" + "text/template" +) + +//go:embed grok-patterns +var grokPatterns string + +const baseTemplate = `// Code generated by gen.go; DO NOT EDIT. + +package grok + +func NewBase() Host { + h := New(){{range .}}{{print "\n\t"}}h.Must({{printf "%q" .Name}}, {{printf "%q" .Pattern}}){{end}} + return h +} +` + +func must(err error) { + if err != nil { + panic(err) + } +} + +func main() { + type namedPattern struct{ Name, Pattern string } + var patterns []namedPattern + lineRegexp := regexp.MustCompile(`^(\w+)\s+(.+)$`) + scanner := bufio.NewScanner(strings.NewReader(grokPatterns)) + for scanner.Scan() { + if sub := lineRegexp.FindStringSubmatch(scanner.Text()); sub != nil { + patterns = append(patterns, namedPattern{Name: sub[1], Pattern: sub[2]}) + } + } + must(scanner.Err()) + f, err := os.Create("base.go") + must(err) + defer f.Close() + t := template.Must(template.New("base").Parse(baseTemplate)) + must(t.Execute(f, patterns)) +} diff --git a/pkg/grok/grok-patterns b/pkg/grok/grok-patterns new file mode 100644 index 0000000000..40ec27920a --- /dev/null +++ b/pkg/grok/grok-patterns @@ -0,0 +1,98 @@ +# Adapted from https://github.com/vjeantet/grok/blob/master/patterns/grok-patterns +USERNAME 
[a-zA-Z0-9._-]+ +USER %{USERNAME} +INT (?:[+-]?(?:[0-9]+)) +BASE10NUM ([+-]?(?:[0-9]+(?:\.[0-9]+)?)|\.[0-9]+) +NUMBER (?:%{BASE10NUM}) +BASE16NUM [+-]?(?:0x)?(?:[0-9A-Fa-f]+) +BASE16FLOAT \b[+-]?(?:0x)?(?:(?:[0-9A-Fa-f]+(?:\.[0-9A-Fa-f]*)?)|(?:\.[0-9A-Fa-f]+))\b + +POSINT \b(?:[1-9][0-9]*)\b +NONNEGINT \b(?:[0-9]+)\b +WORD \b\w+\b +NOTSPACE \S+ +SPACE \s* +DATA .*? +GREEDYDATA .* +QUOTEDSTRING "([^"\\]*(\\.[^"\\]*)*)"|\'([^\'\\]*(\\.[^\'\\]*)*)\' +UUID [A-Fa-f0-9]{8}-(?:[A-Fa-f0-9]{4}-){3}[A-Fa-f0-9]{12} + +# Networking +CISCOMAC (?:(?:[A-Fa-f0-9]{4}\.){2}[A-Fa-f0-9]{4}) +WINDOWSMAC (?:(?:[A-Fa-f0-9]{2}-){5}[A-Fa-f0-9]{2}) +COMMONMAC (?:(?:[A-Fa-f0-9]{2}:){5}[A-Fa-f0-9]{2}) +MAC (?:%{CISCOMAC}|%{WINDOWSMAC}|%{COMMONMAC}) +IPV6 ((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:)))(%.+)? +IPV4 (?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?) 
+IP (?:%{IPV6}|%{IPV4}) +HOSTNAME \b(?:[0-9A-Za-z][0-9A-Za-z-]{0,62})(?:\.(?:[0-9A-Za-z][0-9A-Za-z-]{0,62}))*(\.?|\b) +HOST %{HOSTNAME} +IPORHOST (?:%{HOSTNAME}|%{IP}) +HOSTPORT %{IPORHOST}:%{POSINT} + +# paths +UNIXPATH (/[\w_%!$@:.,-]?/?)(\S+)? +WINPATH ([A-Za-z]:|\\)(?:\\[^\\?*]*)+ +PATH (?:%{UNIXPATH}|%{WINPATH}) +TTY (?:/dev/(pts|tty([pq])?)(\w+)?/?(?:[0-9]+)) + +URIPROTO [A-Za-z]+(\+[A-Za-z+]+)? +URIHOST %{IPORHOST}(?::%{POSINT:port})? +# uripath comes loosely from RFC1738, but mostly from what Firefox +# doesn't turn into %XX +URIPATH (?:/[A-Za-z0-9$.+!*'(){},~:;=@#%_\-]*)+ +#URIPARAM \?(?:[A-Za-z0-9]+(?:=(?:[^&]*))?(?:&(?:[A-Za-z0-9]+(?:=(?:[^&]*))?)?)*)? +URIPARAM \?[A-Za-z0-9$.+!*'|(){},~@#%&/=:;_?\-\[\]]* +URIPATHPARAM %{URIPATH}(?:%{URIPARAM})? +URI %{URIPROTO}://(?:%{USER}(?::[^@]*)?@)?(?:%{URIHOST})?(?:%{URIPATHPARAM})? + +# Months: January, Feb, 3, 03, 12, December +MONTH \b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\b +MONTHNUM (?:0?[1-9]|1[0-2]) +MONTHDAY (?:(?:0[1-9])|(?:[12][0-9])|(?:3[01])|[1-9]) + +# Days: Monday, Tue, Thu, etc... +DAY (?:Mon(?:day)?|Tue(?:sday)?|Wed(?:nesday)?|Thu(?:rsday)?|Fri(?:day)?|Sat(?:urday)?|Sun(?:day)?) + +# Years? +#YEAR (?>\d\d){1,2} +#c +YEAR (\d\d){1,2} + +HOUR (?:2[0123]|[01]?[0-9]) +MINUTE (?:[0-5][0-9]) +# '60' is a leap second in most time standards and thus is valid. +SECOND (?:(?:[0-5][0-9]|60)(?:[:.,][0-9]+)?) +#TIME (?!<[0-9])%{HOUR}:%{MINUTE}(?::%{SECOND})(?![0-9]) +#c +TIME ([^0-9]?)%{HOUR}:%{MINUTE}(?::%{SECOND})([^0-9]?) +# datestamp is YYYY/MM/DD-HH:MM:SS.UUUU (or something like it) +DATE_US %{MONTHNUM}[/-]%{MONTHDAY}[/-]%{YEAR} +DATE_EU %{MONTHDAY}[./-]%{MONTHNUM}[./-]%{YEAR} +ISO8601_TIMEZONE (?:Z|[+-]%{HOUR}(?::?%{MINUTE})) +ISO8601_SECOND (?:%{SECOND}|60) +TIMESTAMP_ISO8601 %{YEAR}-%{MONTHNUM}-%{MONTHDAY}[T ]%{HOUR}:?%{MINUTE}(?::?%{SECOND})?%{ISO8601_TIMEZONE}? 
+DATE %{DATE_US}|%{DATE_EU} +DATESTAMP %{DATE}[- ]%{TIME} +TZ (?:[PMCE][SD]T|UTC|GMT) +DATESTAMP_RFC822 %{DAY} %{MONTH} %{MONTHDAY} %{YEAR} %{TIME} %{TZ} +DATESTAMP_OTHER %{DAY} %{MONTH} %{MONTHDAY} %{TIME} %{TZ} %{YEAR} + +# Syslog Dates: Month Day HH:MM:SS +SYSLOGTIMESTAMP %{MONTH} +%{MONTHDAY} %{TIME} +PROG (?:[\w._/%-]+) +SYSLOGPROG %{PROG:program}(?:\[%{POSINT:pid}\])? +SYSLOGHOST %{IPORHOST} +SYSLOGFACILITY <%{NONNEGINT:facility}.%{NONNEGINT:priority}> +HTTPDATE %{MONTHDAY}/%{MONTH}/%{YEAR}:%{TIME} %{INT} + +# Shortcuts +QS %{QUOTEDSTRING} + +# Log formats +SYSLOGBASE %{SYSLOGTIMESTAMP:timestamp} (?:%{SYSLOGFACILITY} )?%{SYSLOGHOST:logsource} %{SYSLOGPROG}: +COMMONAPACHELOG %{IPORHOST:clientip} %{USER:ident} %{USER:auth} \[%{HTTPDATE:timestamp}\] "(?:%{WORD:verb} %{NOTSPACE:request}(?: HTTP/%{NUMBER:httpversion})?|%{DATA:rawrequest})" %{NUMBER:response} (?:%{NUMBER:bytes}|-) +COMBINEDAPACHELOG %{COMMONAPACHELOG} %{QS:referrer} %{QS:agent} + +# Log Levels +LOGLEVEL ([A-a]lert|ALERT|[T|t]race|TRACE|[D|d]ebug|DEBUG|[N|n]otice|NOTICE|[I|i]nfo|INFO|[W|w]arn?(?:ing)?|WARN?(?:ING)?|[E|e]rr?(?:or)?|ERR?(?:OR)?|[C|c]rit?(?:ical)?|CRIT?(?:ICAL)?|[F|f]atal|FATAL|[S|s]evere|SEVERE|EMERG(?:ENCY)?|[Ee]merg(?:ency)?) 
diff --git a/pkg/grok/grok.go b/pkg/grok/grok.go new file mode 100644 index 0000000000..d36633c37b --- /dev/null +++ b/pkg/grok/grok.go @@ -0,0 +1,238 @@ +//go:generate go run gen.go + +// Adapted from github.com/logrusorgru/grokky +package grok + +import ( + "bufio" + "errors" + "fmt" + "io" + "regexp" + "sort" + "strings" +) + +var ( + // ErrEmptyName arises when pattern name is an empty string + ErrEmptyName = errors.New("an empty name") + // ErrEmptyExpression arises when expression is an empty string + ErrEmptyExpression = errors.New("an empty expression") + // ErrAlreadyExist arises when pattern with given name already exists + ErrAlreadyExist = errors.New("the pattern already exist") + // ErrNotExist arises when pattern with given name doesn't exist + ErrNotExist = errors.New("pattern doesn't exist") +) + +// Host is a patterns collection. Host does not need to be kept around +// after all needed patterns are generated +type Host map[string]string + +// New returns new empty host +func New() Host { return make(Host) } + +// Add a new pattern to the Host. If a pattern name +// already exists the ErrAlreadyExist will be returned. 
+func (h Host) Add(name, expr string) error { + if name == "" { + return ErrEmptyName + } + if expr == "" { + return ErrEmptyExpression + } + if _, ok := h[name]; ok { + return ErrAlreadyExist + } + if _, err := h.compileExternal(expr); err != nil { + return err + } + h[name] = expr + return nil +} + +func (h Host) Must(name, expr string) { + if err := h.Add(name, expr); err != nil { + panic(fmt.Errorf("%s: %w", name, err)) + } +} + +func (h Host) compile(name string) (*Pattern, error) { + expr, ok := h[name] + if !ok { + return nil, ErrNotExist + } + return h.compileExternal(expr) +} + +var patternRegexp = regexp.MustCompile(`\%\{(\w+)(\:([\w\[\]\.]+)(\:(\w+))?)?}`) + +func (h Host) compileExternal(expr string) (*Pattern, error) { + subs := patternRegexp.FindAllString(expr, -1) + ts := make(map[string]struct{}) + for _, s := range subs { + name, sem := split(s) + if _, ok := h[name]; !ok { + return nil, fmt.Errorf("the '%s' pattern doesn't exist", name) + } + ts[sem] = struct{}{} + } + if len(subs) == 0 { + r, err := regexp.Compile(expr) + if err != nil { + return nil, err + } + p := &Pattern{Regexp: r} + return p, nil + } + spl := patternRegexp.Split(expr, -1) + msi := make(map[string]int) + order := 1 // semantic order + var res string + for i := 0; i < len(spl)-1; i++ { + splPart := spl[i] + order += capCount(splPart) + sub := subs[i] + subName, subSem := split(sub) + p, err := h.compile(subName) + if err != nil { + return nil, err + } + sub = p.String() + subNumSubexp := p.NumSubexp() + subNumSubexp++ + sub = wrap(sub) + if subSem != "" { + msi[subSem] = order + } + res += splPart + sub + // add sub semantics to this semantics + for k, v := range p.s { + if _, ok := ts[k]; !ok { + msi[k] = order + v + } + } + order += subNumSubexp + } + res += spl[len(spl)-1] + r, err := regexp.Compile(res) + if err != nil { + return nil, err + } + p := &Pattern{Regexp: r} + p.s = msi + p.order = make(map[int]string) + for k, v := range msi { + p.order[v] = k + } + return p, 
nil +} + +func split(s string) (name, sem string) { + ss := patternRegexp.FindStringSubmatch(s) + if len(ss) >= 2 { + name = ss[1] + } + if len(ss) >= 4 { + sem = ss[3] + } + return +} + +func wrap(s string) string { return "(" + s + ")" } + +var ( + nonCapLeftRxp = regexp.MustCompile(`\(\?[imsU\-]*\:`) + nonCapFlagsRxp = regexp.MustCompile(`\(?[imsU\-]+\)`) +) + +func capCount(in string) int { + leftParens := strings.Count(in, "(") + nonCapLeft := len(nonCapLeftRxp.FindAllString(in, -1)) + nonCapBoth := len(nonCapFlagsRxp.FindAllString(in, -1)) + escapedLeftParens := strings.Count(in, `\(`) + return leftParens - nonCapLeft - nonCapBoth - escapedLeftParens +} + +// Get pattern by name from the Host. +func (h Host) Get(name string) (*Pattern, error) { + return h.compile(name) +} + +// Compile and get pattern without name (and without adding it to this Host) +func (h Host) Compile(expr string) (*Pattern, error) { + if expr == "" { + return nil, ErrEmptyExpression + } + return h.compileExternal(expr) +} + +type Pattern struct { + *regexp.Regexp + s map[string]int + order map[int]string + cache []string +} + +// Parse returns a map of matches on the input. The map can be empty. +func (p *Pattern) Parse(input string) map[string]string { + ss := p.FindStringSubmatch(input) + r := make(map[string]string) + if len(ss) <= 1 { + return r + } + for sem, order := range p.s { + r[sem] = ss[order] + } + return r +} + +func (p *Pattern) ParseValues(input string) []string { + a := p.FindStringSubmatchIndex(input) + if a == nil { + return nil + } + p.cache = p.cache[:0] + for i := 0; len(p.cache) < len(p.s); i++ { + if _, ok := p.order[i]; !ok { + continue + } + p.cache = append(p.cache, input[a[i*2]:a[i*2+1]]) + } + return p.cache +} + +// Names returns all names that this pattern has in order. 
+func (p *Pattern) Names() (ss []string) { + ss = make([]string, 0, len(p.s)) + for k := range p.s { + ss = append(ss, k) + } + sort.Slice(ss, func(i, j int) bool { + return p.s[ss[i]] < p.s[ss[j]] + }) + return +} + +// AddFromReader appends all patterns from the reader to this Host. +func (h Host) AddFromReader(r io.Reader) error { + scanner := bufio.NewScanner(r) + for scanner.Scan() { + if err := h.addFromLine(scanner.Text()); err != nil { + return err + } + } + if err := scanner.Err(); err != nil { + return err + } + return nil +} + +var lineRegexp = regexp.MustCompile(`^(\w+)\s+(.+)$`) + +func (h Host) addFromLine(line string) error { + sub := lineRegexp.FindStringSubmatch(line) + if len(sub) == 0 { // no match + return nil + } + return h.Add(sub[1], sub[2]) +} diff --git a/pkg/grok/host_test.go b/pkg/grok/host_test.go new file mode 100644 index 0000000000..acfd483e87 --- /dev/null +++ b/pkg/grok/host_test.go @@ -0,0 +1,100 @@ +package grok + +import ( + "strings" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestNew(t *testing.T) { + h := New() + require.Len(t, h, 0) + require.NotNil(t, h) +} + +func TestHost_Add(t *testing.T) { + h := New() + require.ErrorIs(t, h.Add("", "expr"), ErrEmptyName) + require.Len(t, h, 0) + require.ErrorIs(t, h.Add("name", ""), ErrEmptyExpression) + require.Len(t, h, 0) + require.NoError(t, h.Add("DIGIT", `\d`)) + require.Len(t, h, 1) + require.ErrorIs(t, h.Add("DIGIT", `[+-](0x)?\d`), ErrAlreadyExist) + require.Len(t, h, 1) + require.Error(t, h.Add("BAD", `(?![0-5])`)) + require.Len(t, h, 1) + require.NoError(t, h.Add("TWODIG", `%{DIGIT}-%{DIGIT}`)) + require.Len(t, h, 2) + require.Error(t, h.Add("THREE", `%{NOT}-%{EXIST}`)) + require.Len(t, h, 2) + require.NoError(t, h.Add("FOUR", `%{DIGIT:one}-%{DIGIT:two}`)) + require.Len(t, h, 3) + require.Error(t, h.Add("FIVE", `(?!\d)%{DIGIT}(?!\d)`)) + require.Len(t, h, 3) + require.NoError(t, h.Add("SIX", `%{FOUR:four}-%{DIGIT:six}`)) + require.Len(t, h, 4) +} + +func 
TestHost_Compile(t *testing.T) { + h := New() + _, err := h.Compile("") + require.ErrorIs(t, err, ErrEmptyExpression) + require.Len(t, h, 0) + p, err := h.Compile(`\d+`) + require.NoError(t, err) + require.NotNil(t, p) + require.Len(t, h, 0) +} + +func TestHost_Get(t *testing.T) { + h := New() + require.NoError(t, h.Add("DIG", `\d`)) + p, err := h.Get("DIG") + require.NoError(t, err) + require.NotNil(t, p) + p, err = h.Get("SEVEN") + require.ErrorIs(t, err, ErrNotExist) + require.Nil(t, p) +} + +func TestHost_AddFromReader(t *testing.T) { + s := `# +# for testing +# +ONE \d +TWO %{ONE:two} +THREE %{ONE:one}-%{TWO}-%{ONE:three} + +# +# enough +#` + h := New() + require.NoError(t, h.AddFromReader(strings.NewReader(s))) + require.Len(t, h, 3) + _, err := h.Get("ONE") + require.NoError(t, err) + _, err = h.Get("TWO") + require.NoError(t, err) + _, err = h.Get("THREE") + require.NoError(t, err) +} + +func TestHost_AddFromReader_malformedPatterns(t *testing.T) { + s := ` +ONE \d +TWO %{THREE:two}` + require.Error(t, New().AddFromReader(strings.NewReader(s))) +} + +func TestHost_inject(t *testing.T) { + h := New() + h["TWO"] = `(?!\d)` + require.Error(t, h.Add("ONE", `%{TWO:one}`)) +} + +func TestHost_addFromLine(t *testing.T) { + h := New() + require.Error(t, h.addFromLine("ONE (?!\\d)")) +} diff --git a/pkg/grok/patterns_test.go b/pkg/grok/patterns_test.go new file mode 100644 index 0000000000..31123a1ae2 --- /dev/null +++ b/pkg/grok/patterns_test.go @@ -0,0 +1,76 @@ +package grok + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestPattern_Parse(t *testing.T) { + h := New() + require.NoError(t, h.Add("ONE", `\d`)) + require.NoError(t, h.Add("TWO", `%{ONE:one}-%{ONE:two}`)) + require.NoError(t, h.Add("THREE", `%{ONE:zero}-%{TWO:three}`)) + p, err := h.Get("ONE") + require.NoError(t, err) + require.NotNil(t, p.Parse("1")) + p, err = h.Get("TWO") + require.NoError(t, err) + require.Equal(t, map[string]string{"one": "1", "two": "2"}, 
p.Parse("1-2")) + p, err = h.Get("THREE") + require.NoError(t, err) + require.Equal(t, map[string]string{ + "one": "1", + "two": "2", + "zero": "0", + "three": "1-2", + }, p.Parse("0-1-2")) + require.NoError(t, h.Add("FOUR", `%{TWO:two}`)) + p, err = h.Get("FOUR") + require.NoError(t, err) + require.Equal(t, map[string]string{"one": "1", "two": "1-2"}, p.Parse("1-2")) +} + +func TestPattern_nestedGroups(t *testing.T) { + h := New() + require.NoError(t, h.Add("ONE", `\d`)) + require.NoError(t, h.Add("TWO", `(?:%{ONE:one})-(?:%{ONE:two})?`)) + p, err := h.Get("TWO") + require.NoError(t, err) + require.Equal(t, map[string]string{"one": "1", "two": "2"}, p.Parse("1-2")) + require.Equal(t, map[string]string{"one": "1", "two": ""}, p.Parse("1-")) +} + +func TestPattern_Names(t *testing.T) { + h := New() + require.NoError(t, h.Add("ONE", `\d`)) + require.NoError(t, h.Add("TWO", `%{ONE:one}-%{ONE:two}`)) + require.NoError(t, h.Add("THREE", `%{ONE:zero}-%{TWO:three}`)) + p, err := h.Get("THREE") + require.NoError(t, err) + require.Equal(t, []string{"zero", "three", "one", "two"}, p.Names()) +} + +func TestPattern_ParseValues(t *testing.T) { + h := NewBase() + p, err := h.Compile("%{TIMESTAMP_ISO8601:event_time} %{LOGLEVEL:log_level} %{GREEDYDATA:log_message}") + require.NoError(t, err) + ss := p.ParseValues("2020-09-16T04:20:42.45+01:00 DEBUG This is a sample debug log message") + require.Equal(t, []string{"2020-09-16T04:20:42.45+01:00", "DEBUG", "This is a sample debug log message"}, ss) +} + +func TestPattern_NamesIgnoreTypeCast(t *testing.T) { + h := New() + require.NoError(t, h.Add("ONE", `\d`)) + p, err := h.Compile("%{ONE:one:int}") + require.NoError(t, err) + require.Equal(t, []string{"one"}, p.Names()) +} + +func TestPattern_NamesNested(t *testing.T) { + h := New() + require.NoError(t, h.Add("ONE", `\d`)) + p, err := h.Compile("%{ONE:num.one}-%{ONE:[num][two]}") + require.NoError(t, err) + require.Equal(t, []string{"num.one", "[num][two]"}, p.Names()) +} diff --git 
a/runtime/expr/function/function.go b/runtime/expr/function/function.go index 46d1182294..e495511fdf 100644 --- a/runtime/expr/function/function.go +++ b/runtime/expr/function/function.go @@ -33,6 +33,9 @@ func New(zctx *zed.Context, name string, narg int) (expr.Function, field.Path, e // special grep form will make it look like a function call // and we don't want the error to say unknown function. return nil, nil, errors.New("syntax error") + case "grok": + argmin, argmax = 2, 3 + f = newGrok(zctx) case "len": f = &LenFn{zctx: zctx} case "abs": diff --git a/runtime/expr/function/grok.go b/runtime/expr/function/grok.go new file mode 100644 index 0000000000..23f85ca059 --- /dev/null +++ b/runtime/expr/function/grok.go @@ -0,0 +1,106 @@ +package function + +import ( + "fmt" + "strings" + + "github.com/brimdata/zed" + "github.com/brimdata/zed/pkg/grok" + "github.com/brimdata/zed/zcode" +) + +type Grok struct { + zctx *zed.Context + builder zcode.Builder + hosts map[string]*host +} + +func newGrok(zctx *zed.Context) *Grok { + return &Grok{ + zctx: zctx, + hosts: make(map[string]*host), + } +} + +func (g *Grok) Call(ectx zed.Allocator, vals []zed.Value) *zed.Value { + patternArg, inputArg, defArg := vals[0], vals[1], zed.NullString + if len(vals) == 3 { + defArg = &vals[2] + } + switch { + case zed.TypeUnder(defArg.Type) != zed.TypeString: + return g.error(ectx, "definitions argument must be a string", defArg) + case zed.TypeUnder(patternArg.Type) != zed.TypeString: + return g.error(ectx, "pattern argument must be a string", &patternArg) + case zed.TypeUnder(inputArg.Type) != zed.TypeString: + return g.error(ectx, "input argument must be a string", &inputArg) + } + h, err := g.getHost(defArg.AsString()) + if err != nil { + return g.error(ectx, err.Error(), defArg) + } + p, err := h.getPattern(g.zctx, patternArg.AsString()) + if err != nil { + return g.error(ectx, err.Error(), &patternArg) + } + ss := p.ParseValues(inputArg.AsString()) + if ss == nil { + return 
g.error(ectx, "value does not match pattern", &inputArg) + } + g.builder.Reset() + for _, s := range ss { + g.builder.Append([]byte(s)) + } + return ectx.NewValue(p.typ, g.builder.Bytes()) +} + +func (g *Grok) error(ectx zed.Allocator, err string, val *zed.Value) *zed.Value { + err = fmt.Sprintf("grok(): %s", err) + if val == nil { + return ectx.CopyValue(*g.zctx.NewErrorf(err)) + } + return ectx.CopyValue(*g.zctx.WrapError(err, val)) +} + +func (g *Grok) getHost(defs string) (*host, error) { + h, ok := g.hosts[defs] + if !ok { + h = &host{Host: grok.NewBase(), patterns: make(map[string]*pattern)} + if err := h.AddFromReader(strings.NewReader(defs)); err != nil { + return nil, err + } + g.hosts[defs] = h + } + return h, nil +} + +type host struct { + grok.Host + patterns map[string]*pattern +} + +func (h *host) getPattern(zctx *zed.Context, patternArg string) (*pattern, error) { + p, ok := h.patterns[patternArg] + if !ok { + pat, err := h.Host.Compile(patternArg) + if err != nil { + return nil, err + } + var fields []zed.Field + for _, name := range pat.Names() { + fields = append(fields, zed.NewField(name, zed.TypeString)) + } + typ, err := zctx.LookupTypeRecord(fields) + if err != nil { + return nil, err + } + p = &pattern{Pattern: pat, typ: typ} + h.patterns[patternArg] = p + } + return p, nil +} + +type pattern struct { + *grok.Pattern + typ zed.Type +} diff --git a/runtime/expr/function/ztests/grok.yaml b/runtime/expr/function/ztests/grok.yaml new file mode 100644 index 0000000000..4b088a6ce3 --- /dev/null +++ b/runtime/expr/function/ztests/grok.yaml @@ -0,0 +1,38 @@ +script: | + zq -z 'grok(pattern, field)' simple.zson + echo "// ===" + echo '"0-1-2"' | zq -z -I patterns.zed - + echo "// ===" + # Ignores type annotation. + echo '"0"' | zq -z 'grok("%{INT:int:int64}", this)' - + echo "// ===" + # Check to see that duplicate fields are squashed. This is not great but + # this is what grokconstructor.appspot.com does. 
+ zq -z 'grok("%{INT:one} %{INT:one}", "1 2")' + echo "// ===" + echo '"string value"' | zq -z 'grok("%{INT:int}", this)' - + +inputs: + - name: simple.zson + data: | + { + field: "2020-09-16T04:20:42.45+01:00 DEBUG This is a sample debug log message", + pattern: "%{TIMESTAMP_ISO8601:event_time} %{LOGLEVEL:log_level} %{GREEDYDATA:log_message}" + } + - name: patterns.zed + data: | + const pattern = "ONE \\d\n" + "TWO %{ONE:one}-%{ONE:two}" + yield grok("%{ONE:zero}-%{TWO:three}", this, pattern) + +outputs: + - name: stdout + data: | + {event_time:"2020-09-16T04:20:42.45+01:00",log_level:"DEBUG",log_message:"This is a sample debug log message"} + // === + {zero:"0",three:"1-2",one:"1",two:"2"} + // === + {int:"0"} + // === + {one:"2"} + // === + error({message:"grok(): value does not match pattern",on:"string value"}) From 61dabe4153e73128f36c7d38990e9ac5847c2eda Mon Sep 17 00:00:00 2001 From: Noah Treuhaft Date: Tue, 5 Dec 2023 15:12:54 -0500 Subject: [PATCH 2/9] Rename pkg/grok/patterns_test.go to pattern_test.go This matches name of the original file in github.com/logrusorgru/grokky. --- pkg/grok/{patterns_test.go => pattern_test.go} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename pkg/grok/{patterns_test.go => pattern_test.go} (100%) diff --git a/pkg/grok/patterns_test.go b/pkg/grok/pattern_test.go similarity index 100% rename from pkg/grok/patterns_test.go rename to pkg/grok/pattern_test.go From 7892664610af0901c17d54b82f3c841497e84704 Mon Sep 17 00:00:00 2001 From: Noah Treuhaft Date: Tue, 5 Dec 2023 15:16:38 -0500 Subject: [PATCH 3/9] Use base.go from github.com/logrusorgru/grokky It contains more patterns than https://github.com/vjeantet/grok/blob/master/patterns/grok-patterns. 
--- pkg/grok/base.go | 192 ++++++++++++++++++++++++++--------------- pkg/grok/gen.go | 49 ----------- pkg/grok/grok-patterns | 98 --------------------- pkg/grok/grok.go | 8 -- 4 files changed, 122 insertions(+), 225 deletions(-) delete mode 100644 pkg/grok/gen.go delete mode 100644 pkg/grok/grok-patterns diff --git a/pkg/grok/base.go b/pkg/grok/base.go index ee89950f2e..4629f4d1cf 100644 --- a/pkg/grok/base.go +++ b/pkg/grok/base.go @@ -1,75 +1,127 @@ -// Code generated by gen.go; DO NOT EDIT. - package grok +func must(err error) { + if err != nil { + panic(err) + } +} + +// Must is like Add but panics if the expression can't be parsed or +// the name is empty. +func (h Host) Must(name, expr string) { + must(h.Add(name, expr)) +} + +// NewBase creates new Host that filled up with base patterns. +// To see all base patterns open 'base.go' file. func NewBase() Host { - h := New() - h.Must("USERNAME", "[a-zA-Z0-9._-]+") - h.Must("USER", "%{USERNAME}") - h.Must("INT", "(?:[+-]?(?:[0-9]+))") - h.Must("BASE10NUM", "([+-]?(?:[0-9]+(?:\\.[0-9]+)?)|\\.[0-9]+)") - h.Must("NUMBER", "(?:%{BASE10NUM})") - h.Must("BASE16NUM", "[+-]?(?:0x)?(?:[0-9A-Fa-f]+)") - h.Must("BASE16FLOAT", "\\b[+-]?(?:0x)?(?:(?:[0-9A-Fa-f]+(?:\\.[0-9A-Fa-f]*)?)|(?:\\.[0-9A-Fa-f]+))\\b") - h.Must("POSINT", "\\b(?:[1-9][0-9]*)\\b") - h.Must("NONNEGINT", "\\b(?:[0-9]+)\\b") - h.Must("WORD", "\\b\\w+\\b") - h.Must("NOTSPACE", "\\S+") - h.Must("SPACE", "\\s*") - h.Must("DATA", ".*?") - h.Must("GREEDYDATA", ".*") - h.Must("QUOTEDSTRING", "\"([^\"\\\\]*(\\\\.[^\"\\\\]*)*)\"|\\'([^\\'\\\\]*(\\\\.[^\\'\\\\]*)*)\\'") - h.Must("UUID", "[A-Fa-f0-9]{8}-(?:[A-Fa-f0-9]{4}-){3}[A-Fa-f0-9]{12}") - h.Must("CISCOMAC", "(?:(?:[A-Fa-f0-9]{4}\\.){2}[A-Fa-f0-9]{4})") - h.Must("WINDOWSMAC", "(?:(?:[A-Fa-f0-9]{2}-){5}[A-Fa-f0-9]{2})") - h.Must("COMMONMAC", "(?:(?:[A-Fa-f0-9]{2}:){5}[A-Fa-f0-9]{2})") - h.Must("MAC", "(?:%{CISCOMAC}|%{WINDOWSMAC}|%{COMMONMAC})") - h.Must("IPV6", 
"((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3}))|:)))(%.+)?") - h.Must("IPV4", "(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)") - h.Must("IP", "(?:%{IPV6}|%{IPV4})") - h.Must("HOSTNAME", "\\b(?:[0-9A-Za-z][0-9A-Za-z-]{0,62})(?:\\.(?:[0-9A-Za-z][0-9A-Za-z-]{0,62}))*(\\.?|\\b)") - h.Must("HOST", "%{HOSTNAME}") - h.Must("IPORHOST", "(?:%{HOSTNAME}|%{IP})") - h.Must("HOSTPORT", "%{IPORHOST}:%{POSINT}") - h.Must("UNIXPATH", "(/[\\w_%!$@:.,-]?/?)(\\S+)?") - h.Must("WINPATH", "([A-Za-z]:|\\\\)(?:\\\\[^\\\\?*]*)+") - h.Must("PATH", "(?:%{UNIXPATH}|%{WINPATH})") - h.Must("TTY", "(?:/dev/(pts|tty([pq])?)(\\w+)?/?(?:[0-9]+))") - h.Must("URIPROTO", "[A-Za-z]+(\\+[A-Za-z+]+)?") - h.Must("URIHOST", "%{IPORHOST}(?::%{POSINT:port})?") - h.Must("URIPATH", "(?:/[A-Za-z0-9$.+!*'(){},~:;=@#%_\\-]*)+") - h.Must("URIPARAM", "\\?[A-Za-z0-9$.+!*'|(){},~@#%&/=:;_?\\-\\[\\]]*") - 
h.Must("URIPATHPARAM", "%{URIPATH}(?:%{URIPARAM})?") - h.Must("URI", "%{URIPROTO}://(?:%{USER}(?::[^@]*)?@)?(?:%{URIHOST})?(?:%{URIPATHPARAM})?") - h.Must("MONTH", "\\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\\b") - h.Must("MONTHNUM", "(?:0?[1-9]|1[0-2])") - h.Must("MONTHDAY", "(?:(?:0[1-9])|(?:[12][0-9])|(?:3[01])|[1-9])") - h.Must("DAY", "(?:Mon(?:day)?|Tue(?:sday)?|Wed(?:nesday)?|Thu(?:rsday)?|Fri(?:day)?|Sat(?:urday)?|Sun(?:day)?)") - h.Must("YEAR", "(\\d\\d){1,2}") - h.Must("HOUR", "(?:2[0123]|[01]?[0-9])") - h.Must("MINUTE", "(?:[0-5][0-9])") - h.Must("SECOND", "(?:(?:[0-5][0-9]|60)(?:[:.,][0-9]+)?)") - h.Must("TIME", "([^0-9]?)%{HOUR}:%{MINUTE}(?::%{SECOND})([^0-9]?)") - h.Must("DATE_US", "%{MONTHNUM}[/-]%{MONTHDAY}[/-]%{YEAR}") - h.Must("DATE_EU", "%{MONTHDAY}[./-]%{MONTHNUM}[./-]%{YEAR}") - h.Must("ISO8601_TIMEZONE", "(?:Z|[+-]%{HOUR}(?::?%{MINUTE}))") - h.Must("ISO8601_SECOND", "(?:%{SECOND}|60)") - h.Must("TIMESTAMP_ISO8601", "%{YEAR}-%{MONTHNUM}-%{MONTHDAY}[T ]%{HOUR}:?%{MINUTE}(?::?%{SECOND})?%{ISO8601_TIMEZONE}?") - h.Must("DATE", "%{DATE_US}|%{DATE_EU}") - h.Must("DATESTAMP", "%{DATE}[- ]%{TIME}") - h.Must("TZ", "(?:[PMCE][SD]T|UTC|GMT)") - h.Must("DATESTAMP_RFC822", "%{DAY} %{MONTH} %{MONTHDAY} %{YEAR} %{TIME} %{TZ}") - h.Must("DATESTAMP_OTHER", "%{DAY} %{MONTH} %{MONTHDAY} %{TIME} %{TZ} %{YEAR}") - h.Must("SYSLOGTIMESTAMP", "%{MONTH} +%{MONTHDAY} %{TIME}") - h.Must("PROG", "(?:[\\w._/%-]+)") - h.Must("SYSLOGPROG", "%{PROG:program}(?:\\[%{POSINT:pid}\\])?") - h.Must("SYSLOGHOST", "%{IPORHOST}") - h.Must("SYSLOGFACILITY", "<%{NONNEGINT:facility}.%{NONNEGINT:priority}>") - h.Must("HTTPDATE", "%{MONTHDAY}/%{MONTH}/%{YEAR}:%{TIME} %{INT}") - h.Must("QS", "%{QUOTEDSTRING}") - h.Must("SYSLOGBASE", "%{SYSLOGTIMESTAMP:timestamp} (?:%{SYSLOGFACILITY} )?%{SYSLOGHOST:logsource} %{SYSLOGPROG}:") - h.Must("COMMONAPACHELOG", "%{IPORHOST:clientip} %{USER:ident} 
%{USER:auth} \\[%{HTTPDATE:timestamp}\\] \"(?:%{WORD:verb} %{NOTSPACE:request}(?: HTTP/%{NUMBER:httpversion})?|%{DATA:rawrequest})\" %{NUMBER:response} (?:%{NUMBER:bytes}|-)") - h.Must("COMBINEDAPACHELOG", "%{COMMONAPACHELOG} %{QS:referrer} %{QS:agent}") - h.Must("LOGLEVEL", "([A-a]lert|ALERT|[T|t]race|TRACE|[D|d]ebug|DEBUG|[N|n]otice|NOTICE|[I|i]nfo|INFO|[W|w]arn?(?:ing)?|WARN?(?:ING)?|[E|e]rr?(?:or)?|ERR?(?:OR)?|[C|c]rit?(?:ical)?|CRIT?(?:ICAL)?|[F|f]atal|FATAL|[S|s]evere|SEVERE|EMERG(?:ENCY)?|[Ee]merg(?:ency)?)") + h := make(Host) + // + h.Must("USERNAME", `[a-zA-Z0-9._-]+`) + h.Must("USER", `%{USERNAME}`) + h.Must("EMAILLOCALPART", `[a-zA-Z][a-zA-Z0-9_.+-=:]+`) + h.Must("HOSTNAME", `\b[0-9A-Za-z][0-9A-Za-z-]{0,62}(?:\.[0-9A-Za-z][0-9A-Za-z-]{0,62})*(\.?|\b)`) + h.Must("EMAILADDRESS", `%{EMAILLOCALPART}@%{HOSTNAME}`) + h.Must("HTTPDUSER", `%{EMAILADDRESS}|%{USER}`) + h.Must("INT", `[+-]?(?:[0-9]+)`) + h.Must("BASE10NUM", `[+-]?(?:(?:[0-9]+(?:\.[0-9]+)?)|(?:\.[0-9]+))`) + h.Must("NUMBER", `%{BASE10NUM}`) + h.Must("BASE16NUM", `[+-]?(?:0x)?(?:[0-9A-Fa-f]+)`) + h.Must("BASE16FLOAT", `\b[+-]?(?:0x)?(?:(?:[0-9A-Fa-f]+(?:\.[0-9A-Fa-f]*)?)|(?:\.[0-9A-Fa-f]+))\b`) + // + h.Must("POSINT", `\b[1-9][0-9]*\b`) + h.Must("NONNEGINT", `\b[0-9]+\b`) + h.Must("WORD", `\b\w+\b`) + h.Must("NOTSPACE", `\S+`) + h.Must("SPACE", `\s*`) + h.Must("DATA", `.*?`) + h.Must("GREEDYDATA", `.*`) + h.Must("QUOTEDSTRING", `("(\\.|[^\\"]+)+")|""|('(\\.|[^\\']+)+')|''|`+ + "(`(\\\\.|[^\\\\`]+)+`)|``") + h.Must("UUID", `[A-Fa-f0-9]{8}-(?:[A-Fa-f0-9]{4}-){3}[A-Fa-f0-9]{12}`) + // Networking + h.Must("CISCOMAC", `(?:[A-Fa-f0-9]{4}\.){2}[A-Fa-f0-9]{4}`) + h.Must("WINDOWSMAC", `(?:[A-Fa-f0-9]{2}-){5}[A-Fa-f0-9]{2}`) + h.Must("COMMONMAC", `(?:[A-Fa-f0-9]{2}:){5}[A-Fa-f0-9]{2}`) + h.Must("MAC", `%{CISCOMAC}|%{WINDOWSMAC}|%{COMMONMAC}`) + h.Must("IPV6", 
`((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:)))(%.+)?`) + h.Must("IPV4", `(?:(?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5])[.](?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5])[.](?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5])[.](?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5]))`) + h.Must("IP", `%{IPV6}|%{IPV4}`) + h.Must("IPORHOST", `%{IP}|%{HOSTNAME}`) + h.Must("HOSTPORT", `%{IPORHOST}:%{POSINT}`) + + // paths + h.Must("UNIXPATH", `(/([\w_%!$@:.,~-]+|\\.)*)+`) + h.Must("TTY", `/dev/(pts|tty([pq])?)(\w+)?/?(?:[0-9]+)`) + h.Must("WINPATH", `(?:[A-Za-z]+:|\\)(?:\\[^\\?*]*)+`) + h.Must("PATH", `%{UNIXPATH}|%{WINPATH}`) + h.Must("URIPROTO", `[A-Za-z]+(\+[A-Za-z+]+)?`) + h.Must("URIHOST", `%{IPORHOST}(?::%{POSINT:port})?`) + // uripath comes loosely from RFC1738, but mostly from what Firefox + // doesn't turn into %XX + h.Must("URIPATH", `(?:/[A-Za-z0-9$.+!*'(){},~:;=@#%_\-]*)+`) + h.Must("URIPARAM", `\?[A-Za-z0-9$.+!*'|(){},~@#%&/=:;_?\-\[\]<>]*`) + h.Must("URIPATHPARAM", 
`%{URIPATH}(?:%{URIPARAM})?`) + h.Must("URI", `%{URIPROTO}://(?:%{USER}(?::[^@]*)?@)?(?:%{URIHOST})?(?:%{URIPATHPARAM})?`) + // Months: January, Feb, 3, 03, 12, December + h.Must("MONTH", `\bJan(?:uary|uar)?|Feb(?:ruary|ruar)?|M(?:a|ä)?r(?:ch|z)?|Apr(?:il)?|Ma(?:y|i)?|Jun(?:e|i)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|O(?:c|k)?t(?:ober)?|Nov(?:ember)?|De(?:c|z)(?:ember)?\b`) + h.Must("MONTHNUM", `0?[1-9]|1[0-2]`) + h.Must("MONTHNUM2", `0[1-9]|1[0-2]`) + h.Must("MONTHDAY", `(?:0[1-9])|(?:[12][0-9])|(?:3[01])|[1-9]`) + // Days: Monday, Tue, Thu, etc... + h.Must("DAY", `Mon(?:day)?|Tue(?:sday)?|Wed(?:nesday)?|Thu(?:rsday)?|Fri(?:day)?|Sat(?:urday)?|Sun(?:day)?`) + // Years? + h.Must("YEAR", `(?:\d\d){1,2}`) + h.Must("HOUR", `2[0123]|[01]?[0-9]`) + h.Must("MINUTE", `[0-5][0-9]`) + // '60' is a leap second in most time standards and thus is valid. + h.Must("SECOND", `(?:[0-5]?[0-9]|60)(?:[:.,][0-9]+)?`) + h.Must("TIME", `%{HOUR}:%{MINUTE}:%{SECOND}`) + // datestamp is YYYY/MM/DD-HH:MM:SS.UUUU (or something like it) + h.Must("DATE_US", `%{MONTHNUM}[/-]%{MONTHDAY}[/-]%{YEAR}`) + h.Must("DATE_EU", `%{MONTHDAY}[./-]%{MONTHNUM}[./-]%{YEAR}`) + // I really don't know how it's called + h.Must("DATE_X", `%{YEAR}/%{MONTHNUM2}/%{MONTHDAY}`) + h.Must("ISO8601_TIMEZONE", `Z|[+-]%{HOUR}(?::?%{MINUTE})`) + h.Must("ISO8601_SECOND", `%{SECOND}|60`) + h.Must("TIMESTAMP_ISO8601", `%{YEAR}-%{MONTHNUM}-%{MONTHDAY}[T ]%{HOUR}:?%{MINUTE}(?::?%{SECOND})?%{ISO8601_TIMEZONE}?`) + h.Must("DATE", `%{DATE_US}|%{DATE_EU}|%{DATE_X}`) + h.Must("DATESTAMP", `%{DATE}[- ]%{TIME}`) + h.Must("TZ", `[A-Z]{3}`) + h.Must("NUMTZ", `[+-]\d{4}`) + h.Must("DATESTAMP_RFC822", `%{DAY} %{MONTH} %{MONTHDAY} %{YEAR} %{TIME} %{TZ}`) + h.Must("DATESTAMP_RFC2822", `%{DAY}, %{MONTHDAY} %{MONTH} %{YEAR} %{TIME} %{ISO8601_TIMEZONE}`) + h.Must("DATESTAMP_OTHER", `%{DAY} %{MONTH} %{MONTHDAY} %{TIME} %{TZ} %{YEAR}`) + h.Must("DATESTAMP_EVENTLOG", `%{YEAR}%{MONTHNUM2}%{MONTHDAY}%{HOUR}%{MINUTE}%{SECOND}`) + 
h.Must("HTTPDERROR_DATE", `%{DAY} %{MONTH} %{MONTHDAY} %{TIME} %{YEAR}`) + // golang time patterns + h.Must("ANSIC", `%{DAY} %{MONTH} [_123]\d %{TIME} %{YEAR}`) + h.Must("UNIXDATE", `%{DAY} %{MONTH} [_123]\d %{TIME} %{TZ} %{YEAR}`) + h.Must("RUBYDATE", `%{DAY} %{MONTH} [0-3]\d %{TIME} %{NUMTZ} %{YEAR}`) + h.Must("RFC822Z", `[0-3]\d %{MONTH} %{YEAR} %{TIME} %{NUMTZ}`) + h.Must("RFC850", `%{DAY}, [0-3]\d-%{MONTH}-%{YEAR} %{TIME} %{TZ}`) + h.Must("RFC1123", `%{DAY}, [0-3]\d %{MONTH} %{YEAR} %{TIME} %{TZ}`) + h.Must("RFC1123Z", `%{DAY}, [0-3]\d %{MONTH} %{YEAR} %{TIME} %{NUMTZ}`) + h.Must("RFC3339", `%{YEAR}-[01]\d-[0-3]\dT%{TIME}%{ISO8601_TIMEZONE}`) + h.Must("RFC3339NANO", `%{YEAR}-[01]\d-[0-3]\dT%{TIME}\.\d{9}%{ISO8601_TIMEZONE}`) + h.Must("KITCHEN", `\d{1,2}:\d{2}(AM|PM|am|pm)`) + // Syslog Dates: Month Day HH:MM:SS + h.Must("SYSLOGTIMESTAMP", `%{MONTH} +%{MONTHDAY} %{TIME}`) + h.Must("PROG", `[\x21-\x5a\x5c\x5e-\x7e]+`) + h.Must("SYSLOGPROG", `%{PROG:program}(?:\[%{POSINT:pid}\])?`) + h.Must("SYSLOGHOST", `%{IPORHOST}`) + h.Must("SYSLOGFACILITY", `<%{NONNEGINT:facility}.%{NONNEGINT:priority}>`) + h.Must("HTTPDATE", `%{MONTHDAY}/%{MONTH}/%{YEAR}:%{TIME} %{INT}`) + // Shortcuts + h.Must("QS", `%{QUOTEDSTRING}`) + // Log Levels + h.Must("LOGLEVEL", `[Aa]lert|ALERT|[Tt]race|TRACE|[Dd]ebug|DEBUG|[Nn]otice|NOTICE|[Ii]nfo|INFO|[Ww]arn?(?:ing)?|WARN?(?:ING)?|[Ee]rr?(?:or)?|ERR?(?:OR)?|[Cc]rit?(?:ical)?|CRIT?(?:ICAL)?|[Ff]atal|FATAL|[Ss]evere|SEVERE|EMERG(?:ENCY)?|[Ee]merg(?:ency)?`) + // Log formats + h.Must("SYSLOGBASE", `%{SYSLOGTIMESTAMP:timestamp} (?:%{SYSLOGFACILITY} )?%{SYSLOGHOST:logsource} %{SYSLOGPROG}:`) + h.Must("COMMONAPACHELOG", `%{IPORHOST:clientip} %{HTTPDUSER:ident} %{USER:auth} \[%{HTTPDATE:timestamp}\] "(?:%{WORD:verb} %{NOTSPACE:request}(?: HTTP/%{NUMBER:httpversion})?|%{DATA:rawrequest})" %{NUMBER:response} (?:%{NUMBER:bytes}|-)`) + h.Must("COMBINEDAPACHELOG", `%{COMMONAPACHELOG} %{QS:referrer} %{QS:agent}`) + h.Must("HTTPD20_ERRORLOG", 
`\[%{HTTPDERROR_DATE:timestamp}\] \[%{LOGLEVEL:loglevel}\] (?:\[client %{IPORHOST:clientip}\] ){0,1}%{GREEDYDATA:errormsg}`) + h.Must("HTTPD24_ERRORLOG", `\[%{HTTPDERROR_DATE:timestamp}\] \[%{WORD:module}:%{LOGLEVEL:loglevel}\] \[pid %{POSINT:pid}:tid %{NUMBER:tid}\]( \(%{POSINT:proxy_errorcode}\)%{DATA:proxy_errormessage}:)?( \[client %{IPORHOST:client}:%{POSINT:clientport}\])? %{DATA:errorcode}: %{GREEDYDATA:message}`) + h.Must("HTTPD_ERRORLOG", `%{HTTPD20_ERRORLOG}|%{HTTPD24_ERRORLOG}`) return h } diff --git a/pkg/grok/gen.go b/pkg/grok/gen.go deleted file mode 100644 index 0d2d0bcc71..0000000000 --- a/pkg/grok/gen.go +++ /dev/null @@ -1,49 +0,0 @@ -//go:build ignore - -package main - -import ( - "bufio" - _ "embed" - "os" - "regexp" - "strings" - "text/template" -) - -//go:embed grok-patterns -var grokPatterns string - -const baseTemplate = `// Code generated by gen.go; DO NOT EDIT. - -package grok - -func NewBase() Host { - h := New(){{range .}}{{print "\n\t"}}h.Must({{printf "%q" .Name}}, {{printf "%q" .Pattern}}){{end}} - return h -} -` - -func must(err error) { - if err != nil { - panic(err) - } -} - -func main() { - type namedPattern struct{ Name, Pattern string } - var patterns []namedPattern - lineRegexp := regexp.MustCompile(`^(\w+)\s+(.+)$`) - scanner := bufio.NewScanner(strings.NewReader(grokPatterns)) - for scanner.Scan() { - if sub := lineRegexp.FindStringSubmatch(scanner.Text()); sub != nil { - patterns = append(patterns, namedPattern{Name: sub[1], Pattern: sub[2]}) - } - } - must(scanner.Err()) - f, err := os.Create("base.go") - must(err) - defer f.Close() - t := template.Must(template.New("base").Parse(baseTemplate)) - must(t.Execute(f, patterns)) -} diff --git a/pkg/grok/grok-patterns b/pkg/grok/grok-patterns deleted file mode 100644 index 40ec27920a..0000000000 --- a/pkg/grok/grok-patterns +++ /dev/null @@ -1,98 +0,0 @@ -# Adapted from https://github.com/vjeantet/grok/blob/master/patterns/grok-patterns -USERNAME [a-zA-Z0-9._-]+ -USER 
%{USERNAME} -INT (?:[+-]?(?:[0-9]+)) -BASE10NUM ([+-]?(?:[0-9]+(?:\.[0-9]+)?)|\.[0-9]+) -NUMBER (?:%{BASE10NUM}) -BASE16NUM [+-]?(?:0x)?(?:[0-9A-Fa-f]+) -BASE16FLOAT \b[+-]?(?:0x)?(?:(?:[0-9A-Fa-f]+(?:\.[0-9A-Fa-f]*)?)|(?:\.[0-9A-Fa-f]+))\b - -POSINT \b(?:[1-9][0-9]*)\b -NONNEGINT \b(?:[0-9]+)\b -WORD \b\w+\b -NOTSPACE \S+ -SPACE \s* -DATA .*? -GREEDYDATA .* -QUOTEDSTRING "([^"\\]*(\\.[^"\\]*)*)"|\'([^\'\\]*(\\.[^\'\\]*)*)\' -UUID [A-Fa-f0-9]{8}-(?:[A-Fa-f0-9]{4}-){3}[A-Fa-f0-9]{12} - -# Networking -CISCOMAC (?:(?:[A-Fa-f0-9]{4}\.){2}[A-Fa-f0-9]{4}) -WINDOWSMAC (?:(?:[A-Fa-f0-9]{2}-){5}[A-Fa-f0-9]{2}) -COMMONMAC (?:(?:[A-Fa-f0-9]{2}:){5}[A-Fa-f0-9]{2}) -MAC (?:%{CISCOMAC}|%{WINDOWSMAC}|%{COMMONMAC}) -IPV6 ((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:)))(%.+)? -IPV4 (?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?) 
-IP (?:%{IPV6}|%{IPV4}) -HOSTNAME \b(?:[0-9A-Za-z][0-9A-Za-z-]{0,62})(?:\.(?:[0-9A-Za-z][0-9A-Za-z-]{0,62}))*(\.?|\b) -HOST %{HOSTNAME} -IPORHOST (?:%{HOSTNAME}|%{IP}) -HOSTPORT %{IPORHOST}:%{POSINT} - -# paths -UNIXPATH (/[\w_%!$@:.,-]?/?)(\S+)? -WINPATH ([A-Za-z]:|\\)(?:\\[^\\?*]*)+ -PATH (?:%{UNIXPATH}|%{WINPATH}) -TTY (?:/dev/(pts|tty([pq])?)(\w+)?/?(?:[0-9]+)) - -URIPROTO [A-Za-z]+(\+[A-Za-z+]+)? -URIHOST %{IPORHOST}(?::%{POSINT:port})? -# uripath comes loosely from RFC1738, but mostly from what Firefox -# doesn't turn into %XX -URIPATH (?:/[A-Za-z0-9$.+!*'(){},~:;=@#%_\-]*)+ -#URIPARAM \?(?:[A-Za-z0-9]+(?:=(?:[^&]*))?(?:&(?:[A-Za-z0-9]+(?:=(?:[^&]*))?)?)*)? -URIPARAM \?[A-Za-z0-9$.+!*'|(){},~@#%&/=:;_?\-\[\]]* -URIPATHPARAM %{URIPATH}(?:%{URIPARAM})? -URI %{URIPROTO}://(?:%{USER}(?::[^@]*)?@)?(?:%{URIHOST})?(?:%{URIPATHPARAM})? - -# Months: January, Feb, 3, 03, 12, December -MONTH \b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\b -MONTHNUM (?:0?[1-9]|1[0-2]) -MONTHDAY (?:(?:0[1-9])|(?:[12][0-9])|(?:3[01])|[1-9]) - -# Days: Monday, Tue, Thu, etc... -DAY (?:Mon(?:day)?|Tue(?:sday)?|Wed(?:nesday)?|Thu(?:rsday)?|Fri(?:day)?|Sat(?:urday)?|Sun(?:day)?) - -# Years? -#YEAR (?>\d\d){1,2} -#c -YEAR (\d\d){1,2} - -HOUR (?:2[0123]|[01]?[0-9]) -MINUTE (?:[0-5][0-9]) -# '60' is a leap second in most time standards and thus is valid. -SECOND (?:(?:[0-5][0-9]|60)(?:[:.,][0-9]+)?) -#TIME (?!<[0-9])%{HOUR}:%{MINUTE}(?::%{SECOND})(?![0-9]) -#c -TIME ([^0-9]?)%{HOUR}:%{MINUTE}(?::%{SECOND})([^0-9]?) -# datestamp is YYYY/MM/DD-HH:MM:SS.UUUU (or something like it) -DATE_US %{MONTHNUM}[/-]%{MONTHDAY}[/-]%{YEAR} -DATE_EU %{MONTHDAY}[./-]%{MONTHNUM}[./-]%{YEAR} -ISO8601_TIMEZONE (?:Z|[+-]%{HOUR}(?::?%{MINUTE})) -ISO8601_SECOND (?:%{SECOND}|60) -TIMESTAMP_ISO8601 %{YEAR}-%{MONTHNUM}-%{MONTHDAY}[T ]%{HOUR}:?%{MINUTE}(?::?%{SECOND})?%{ISO8601_TIMEZONE}? 
-DATE %{DATE_US}|%{DATE_EU} -DATESTAMP %{DATE}[- ]%{TIME} -TZ (?:[PMCE][SD]T|UTC|GMT) -DATESTAMP_RFC822 %{DAY} %{MONTH} %{MONTHDAY} %{YEAR} %{TIME} %{TZ} -DATESTAMP_OTHER %{DAY} %{MONTH} %{MONTHDAY} %{TIME} %{TZ} %{YEAR} - -# Syslog Dates: Month Day HH:MM:SS -SYSLOGTIMESTAMP %{MONTH} +%{MONTHDAY} %{TIME} -PROG (?:[\w._/%-]+) -SYSLOGPROG %{PROG:program}(?:\[%{POSINT:pid}\])? -SYSLOGHOST %{IPORHOST} -SYSLOGFACILITY <%{NONNEGINT:facility}.%{NONNEGINT:priority}> -HTTPDATE %{MONTHDAY}/%{MONTH}/%{YEAR}:%{TIME} %{INT} - -# Shortcuts -QS %{QUOTEDSTRING} - -# Log formats -SYSLOGBASE %{SYSLOGTIMESTAMP:timestamp} (?:%{SYSLOGFACILITY} )?%{SYSLOGHOST:logsource} %{SYSLOGPROG}: -COMMONAPACHELOG %{IPORHOST:clientip} %{USER:ident} %{USER:auth} \[%{HTTPDATE:timestamp}\] "(?:%{WORD:verb} %{NOTSPACE:request}(?: HTTP/%{NUMBER:httpversion})?|%{DATA:rawrequest})" %{NUMBER:response} (?:%{NUMBER:bytes}|-) -COMBINEDAPACHELOG %{COMMONAPACHELOG} %{QS:referrer} %{QS:agent} - -# Log Levels -LOGLEVEL ([A-a]lert|ALERT|[T|t]race|TRACE|[D|d]ebug|DEBUG|[N|n]otice|NOTICE|[I|i]nfo|INFO|[W|w]arn?(?:ing)?|WARN?(?:ING)?|[E|e]rr?(?:or)?|ERR?(?:OR)?|[C|c]rit?(?:ical)?|CRIT?(?:ICAL)?|[F|f]atal|FATAL|[S|s]evere|SEVERE|EMERG(?:ENCY)?|[Ee]merg(?:ency)?) 
diff --git a/pkg/grok/grok.go b/pkg/grok/grok.go index d36633c37b..994b9c60ba 100644 --- a/pkg/grok/grok.go +++ b/pkg/grok/grok.go @@ -1,5 +1,3 @@ -//go:generate go run gen.go - // Adapted from github.com/logrusorgru/grokky package grok @@ -50,12 +48,6 @@ func (h Host) Add(name, expr string) error { return nil } -func (h Host) Must(name, expr string) { - if err := h.Add(name, expr); err != nil { - panic(fmt.Errorf("%s: %w", name, err)) - } -} - func (h Host) compile(name string) (*Pattern, error) { expr, ok := h[name] if !ok { From e81cd3ee8af4df2e5a9b4cf099d18c5066b2d533 Mon Sep 17 00:00:00 2001 From: Noah Treuhaft Date: Tue, 5 Dec 2023 15:19:03 -0500 Subject: [PATCH 4/9] Add attribution in all files --- pkg/grok/base.go | 2 ++ pkg/grok/grok.go | 3 ++- pkg/grok/host_test.go | 2 ++ pkg/grok/pattern_test.go | 2 ++ 4 files changed, 8 insertions(+), 1 deletion(-) diff --git a/pkg/grok/base.go b/pkg/grok/base.go index 4629f4d1cf..1b0cfe9056 100644 --- a/pkg/grok/base.go +++ b/pkg/grok/base.go @@ -1,3 +1,5 @@ +// Adapted from https://github.com/logrusorgru/grokky/blob/f28bfe018565ac1e90d93502eae1170006dd1f48/base.go + package grok func must(err error) { diff --git a/pkg/grok/grok.go b/pkg/grok/grok.go index 994b9c60ba..b53f57ee52 100644 --- a/pkg/grok/grok.go +++ b/pkg/grok/grok.go @@ -1,4 +1,5 @@ -// Adapted from github.com/logrusorgru/grokky +// Adapted from https://github.com/logrusorgru/grokky/blob/f28bfe018565ac1e90d93502eae1170006dd1f48/grok.go + package grok import ( diff --git a/pkg/grok/host_test.go b/pkg/grok/host_test.go index acfd483e87..abf51ab3e7 100644 --- a/pkg/grok/host_test.go +++ b/pkg/grok/host_test.go @@ -1,3 +1,5 @@ +// Adapted from https://github.com/logrusorgru/grokky/blob/f28bfe018565ac1e90d93502eae1170006dd1f48/host_test.go + package grok import ( diff --git a/pkg/grok/pattern_test.go b/pkg/grok/pattern_test.go index 31123a1ae2..e3fdeec271 100644 --- a/pkg/grok/pattern_test.go +++ b/pkg/grok/pattern_test.go @@ -1,3 +1,5 @@ +// Adapted from 
https://github.com/logrusorgru/grokky/blob/f28bfe018565ac1e90d93502eae1170006dd1f48/pattern_test.go + package grok import ( From d0fbb317c4c367246560a947fdd5b7b2ed7f3266 Mon Sep 17 00:00:00 2001 From: Matthew Nibecker Date: Tue, 5 Dec 2023 16:39:27 -0700 Subject: [PATCH 5/9] Update docs/language/functions/grok.md Co-authored-by: Phil Rzewski --- docs/language/functions/grok.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/language/functions/grok.md b/docs/language/functions/grok.md index 8db8e12151..8a891a9701 100644 --- a/docs/language/functions/grok.md +++ b/docs/language/functions/grok.md @@ -11,15 +11,15 @@ grok(pattern: string, s: string, definitions: string) -> any ### Description -The _grok_ function parses a string using a grok pattern and returns -a record containing the parsed fields. The syntax for a grok pattern -is `{%pattern:field_name}` where _pattern_ is a the name of the pattern -to match text with and _field_name_ is resultant field name of the capture +The _grok_ function parses a string `s` using a grok pattern and returns +a record containing the parsed fields. The syntax for `pattern` +is `{%pattern:field_name}` where _pattern_ is the name of the pattern +to match in `s` and _field_name_ is the resultant field name of the capture value. -When provided with three arguments the third argument, definitions, is a string -of named patterns seperated by new lines in the format `PATTERN_NAME PATTERN`. -The named patterns can then be referenced in the grok pattern argument. +When provided with three arguments, `definitions` is a string +of named patterns in the format `PATTERN_NAME PATTERN` each separated by newlines. +The named patterns can then be referenced in the `pattern` argument. 
#### Included Patterns From c0866f116e8a9b6c48a382f1deab6a04eae0592b Mon Sep 17 00:00:00 2001 From: Matthew Nibecker Date: Tue, 5 Dec 2023 16:39:41 -0700 Subject: [PATCH 6/9] Update docs/language/functions/grok.md Co-authored-by: Noah Treuhaft --- docs/language/functions/grok.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/language/functions/grok.md b/docs/language/functions/grok.md index 8a891a9701..0cfe1d6740 100644 --- a/docs/language/functions/grok.md +++ b/docs/language/functions/grok.md @@ -25,7 +25,7 @@ The named patterns can then be referenced in the `pattern` argument. The _grok_ function by default includes a set of builtin named patterns that can be referenced in any pattern. The included named patterns can be seen -[here](https://raw.githubusercontent.com/brimdata/zed/main/pkg/grok/grok-patterns). +[here](https://raw.githubusercontent.com/brimdata/zed/main/pkg/grok/base.go). ### Examples From c5a024ed63f4353dea88a14009a2fc49eae35381 Mon Sep 17 00:00:00 2001 From: Matthew Nibecker Date: Tue, 5 Dec 2023 16:39:50 -0700 Subject: [PATCH 7/9] Update docs/language/functions/README.md Co-authored-by: Phil Rzewski --- docs/language/functions/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/language/functions/README.md b/docs/language/functions/README.md index 4bd878c34e..aa6752a10b 100644 --- a/docs/language/functions/README.md +++ b/docs/language/functions/README.md @@ -28,7 +28,7 @@ Zed's [primitive types](../../formats/zed.md#1-primitive-types), e.g., * [flatten](flatten.md) - transform a record into a flattened map * [floor](floor.md) - floor of a number * [grep](grep.md) - search strings inside of values -* [grok)](grok.md) - parse a string into a structured record +* [grok](grok.md) - parse a string into a structured record * [has](has.md) - test existence of values * [hex](hex.md) - encode/decode hexadecimal strings * [has_error](has_error.md) - test if a value has an error From 
cacf2d5674f36d55c46171c91051816adb4df3a8 Mon Sep 17 00:00:00 2001 From: Matthew Nibecker Date: Wed, 6 Dec 2023 10:36:50 -0700 Subject: [PATCH 8/9] Update docs/language/functions/grok.md Co-authored-by: Noah Treuhaft --- docs/language/functions/grok.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/language/functions/grok.md b/docs/language/functions/grok.md index 0cfe1d6740..6b158afb28 100644 --- a/docs/language/functions/grok.md +++ b/docs/language/functions/grok.md @@ -31,8 +31,8 @@ that can be referenced in any pattern. The included named patterns can be seen Parsing a simple log line using the builtin named patterns: ```mdtest-command -echo '"2020-09-16T04:20:42.45+01:00 DEBUG This is a sample debug log message"' \ - | zq -Z 'yield grok("%{TIMESTAMP_ISO8601:timestamp} %{LOGLEVEL:level} %{GREEDYDATA:message}", this)' - +echo '"2020-09-16T04:20:42.45+01:00 DEBUG This is a sample debug log message"' | + zq -Z 'yield grok("%{TIMESTAMP_ISO8601:timestamp} %{LOGLEVEL:level} %{GREEDYDATA:message}", this)' - ``` => ```mdtest-output From acd5e8f2e50a524b4afa15bc40dcf1b728f2f595 Mon Sep 17 00:00:00 2001 From: Matthew Nibecker Date: Wed, 6 Dec 2023 10:39:08 -0700 Subject: [PATCH 9/9] feedback --- docs/language/functions/grok.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/language/functions/grok.md b/docs/language/functions/grok.md index 6b158afb28..5a68885621 100644 --- a/docs/language/functions/grok.md +++ b/docs/language/functions/grok.md @@ -5,21 +5,21 @@ ### Synopsis ``` -grok(pattern: string, s: string) -> any -grok(pattern: string, s: string, definitions: string) -> any +grok(p: string, s: string) -> any +grok(p: string, s: string, definitions: string) -> any ``` ### Description -The _grok_ function parses a string `s` using a grok pattern and returns -a record containing the parsed fields. 
The syntax for `pattern` +The _grok_ function parses a string `s` using grok pattern `p` and returns +a record containing the parsed fields. The syntax for pattern `p` is `{%pattern:field_name}` where _pattern_ is the name of the pattern to match in `s` and _field_name_ is the resultant field name of the capture value. When provided with three arguments, `definitions` is a string of named patterns in the format `PATTERN_NAME PATTERN` each separated by newlines. -The named patterns can then be referenced in the `pattern` argument. +The named patterns can then be referenced in argument `p`. #### Included Patterns