commit: b4a83c313b6f36d6823fcf8fa370fad5c697dc8a
parent c343f9b2e86d9ba52d9fcb33033336e8a038b435
Author: Haelwenn (lanodan) Monnier <contact@hacktivis.me>
Date: Mon, 24 Feb 2025 13:18:15 +0100
lib/iso_parse: Add support for email datetimes
Diffstat:
4 files changed, 258 insertions(+), 62 deletions(-)
diff --git a/cmd/date.1.in b/cmd/date.1.in
@@ -28,7 +28,8 @@
When
.Nm
is invoked without arguments it displays the current datetime
-Otherwise, depending on the options specified, will print the datetime in a user-defined way.
+Otherwise, depending on the options specified,
+it prints the datetime in a user-defined way.
.Sh OPTIONS
.Bl -tag -width Ds
.It Fl d Ar datetime
@@ -75,7 +76,7 @@ instead of current datetime.
.It Fl R
Set the default value of
.Ar format
-to match RFC5322 (Internet Message Format).
+to match RFC5322 (Email / Internet Message Format).
.It Ar mmddHHMM Ns Oo Oo Ar CC Oc Ns Ar yy Oc
Sets custom datetime, if
.Fl j
@@ -97,7 +98,8 @@ centuries aka %C
century-less years aka %y
.El
.Pp
-For example 072505542024 corresponds to 2024-07-25T05:54, as you can verify with the following command:
+For example 072505542024 corresponds to 2024-07-25T05:54,
+as you can verify with the following command:
.Dl date -j 072505542024 +%Y-%m-%dT%H:%M
.It Cm + Ns Ar format
Set the displayed datetime in
@@ -112,7 +114,7 @@ Otherwise defaults to
.Sh ENVIRONMENT
Look at the manual page of
.Xr strftime 3
-for the environment variables, typical ones are
+for the environment variables, typical ones are
.Ev TZ ,
.Ev LC_TIME
and
diff --git a/lib/iso_parse.c b/lib/iso_parse.c
@@ -18,6 +18,213 @@
#include <string.h> /* memset */
#include <time.h> /* strptime, tm */
+// English three-letter weekday abbreviations, compared byte-for-byte instead of
+// through the locale-dependent %a/%A; the index is the tm_wday value (0 = Sunday).
+static const char *short_weekday_name[7] = {"Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"};
+// English three-letter month abbreviations, compared byte-for-byte instead of
+// through the locale-dependent %b/%B; the index is the tm_mon value (0 = January).
+static const char *short_month_name[12] = {"Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"};
+
+// Parses a numeric UTC offset of the form [+|-]HH[:]MM at the start of `s`.
+//
+// On success:
+// - time->tm_gmtoff receives the offset in seconds (negative for '-')
+// - time->tm_isdst is cleared
+// - time->tm_zone points at a static buffer holding the offset as written,
+//   minus the optional colon (sign followed by four digits)
+// - the return value points at the first character after the offset
+//
+// On failure: *errstr is set to a static message, NULL is returned, and
+// neither `time` nor the static name buffer is modified.
+//
+// The name buffer is shared by every call, so a later successful call changes
+// the text seen through a tm_zone pointer obtained from an earlier one.
+static char *
+tzoffset_parse(char *s, struct tm *time, const char **errstr)
+{
+#ifndef TZNAME_MAX
+#define TZNAME_MAX _POSIX_TZNAME_MAX
+#endif
+#if TZNAME_MAX < 5
+#error TZNAME_MAX is too small
+#endif
+	// sign + HHMM + NUL takes 6 bytes, hence the TZNAME_MAX >= 5 requirement
+	static char offname[TZNAME_MAX + 1] = "";
+
+	const char sign = s[0];
+	if(sign != '+' && sign != '-')
+	{
+		*errstr = "Invalid timezone offset, must start with + or -";
+		return NULL;
+	}
+	s++;
+
+	// <ctype.h> functions require a value representable as unsigned char;
+	// the short-circuit also keeps s[1] unread when s[0] is the terminator
+	if(!isdigit((unsigned char)s[0]) || !isdigit((unsigned char)s[1]))
+	{
+		*errstr = "Invalid timezone offset, no digits after <+|->";
+		return NULL;
+	}
+	const char *hours = s;
+	s += 2;
+
+	if(s[0] == ':') s++;
+
+	if(!isdigit((unsigned char)s[0]) || !isdigit((unsigned char)s[1]))
+	{
+		*errstr = "Invalid timezone offset, no digits after <+|->HH[:]";
+		return NULL;
+	}
+	const char *minutes = s;
+	s += 2;
+
+	long offset = (hours[0] - '0') * 36000L + (hours[1] - '0') * 3600L;
+	offset += (minutes[0] - '0') * 600L + (minutes[1] - '0') * 60L;
+	if(sign == '-') offset = -offset;
+
+	// Everything validated: only now touch the shared buffer and `time`
+	offname[0] = sign;
+	offname[1] = hours[0];
+	offname[2] = hours[1];
+	offname[3] = minutes[0];
+	offname[4] = minutes[1];
+	offname[5] = '\0';
+
+	time->tm_gmtoff = offset;
+	time->tm_isdst  = 0;
+	time->tm_zone   = offname;
+
+	return s;
+}
+
+// Helper for iso_parse: tries to read `arg` as an Email / Internet Message
+// Format date-time (RFC5322 / RFC2822, without folding whitespace or comments):
+//
+//   "[ ]*(Day,)?[ ]*DD[ ]+Mon[ ]+YYYY[ ]+HH:MM(:SS)?[ ]*(GMT|[+-]hh:?mm)"
+//
+// Two-digit years are accepted and mapped as in RFC5322 section 4.3:
+// 00-49 => 2000-2049, 50-99 => 1950-1999.
+// GMT is the only timezone name recognised; anything else has to be a numeric
+// offset.
+//
+// Kept free of strptime() because updating/overriding fields across calls is
+// undefined there and because the weekday and month names have to be matched
+// independently of the locale, which strptime() can't do.
+//
+// Returns a pointer just past the timezone on success, after copying the parsed
+// fields into *time.
+// Returns NULL on failure, leaving *time untouched, with:
+// - *errstr == NULL when there was no "Day," prefix and the date/time fields
+//   did not match, meaning `arg` isn't an email date-time and the caller may
+//   try another format
+// - *errstr pointing to a message in every other failure case
+static char *
+email_date_parse(char *arg, struct tm *time, const char **errstr)
+{
+	// <ctype.h> functions require a value representable as unsigned char
+	for(; isspace((unsigned char)arg[0]); arg++)
+		;
+
+	// Work on a copy so that `time` only changes once the whole datetime is valid
+	struct tm tmp_time = *time;
+	tmp_time.tm_isdst  = -1;
+	tmp_time.tm_wday   = -1;
+
+	// Test arg[0..2] first so arg[3] is never read past the terminator
+	if(arg[0] != '\0' && arg[1] != '\0' && arg[2] != '\0' && arg[3] == ',')
+	{
+		// Because %a/%A is locale-dependent, Sunday is tm_wday=0
+		for(size_t i = 0; i < 7; i++)
+		{
+			if(memcmp(arg, short_weekday_name[i], 3) == 0)
+			{
+				tmp_time.tm_wday = (int)i;
+				break;
+			}
+		}
+
+		if(tmp_time.tm_wday == -1)
+		{
+			*errstr = "Failed parsing short weekday name";
+			errno   = 0;
+			return NULL;
+		}
+
+		arg += 4; // skip "Day,"
+
+		for(; isspace((unsigned char)arg[0]); arg++)
+			;
+	}
+
+	errno            = 0;
+	int parsed       = 0;
+	char month_name[4] = "";
+	if(sscanf(arg,
+	          "%2d %3s %d %2d:%2d%n",
+	          &tmp_time.tm_mday,
+	          month_name,
+	          &tmp_time.tm_year,
+	          &tmp_time.tm_hour,
+	          &tmp_time.tm_min,
+	          &parsed) < 5)
+	{
+		if(errno == 0 || errno == EINVAL)
+		{
+			// Without a weekday prefix nothing so far proves this was meant
+			// to be an email date-time: report "not mine" via a NULL errstr
+			if(tmp_time.tm_wday == -1)
+				*errstr = NULL;
+			else
+				*errstr = "Failed parsing Email-datetime";
+		}
+		else
+		{
+			*errstr = strerror(errno);
+			errno   = 0;
+		}
+		return NULL;
+	}
+
+	// tm_year counts years since 1900
+	if(tmp_time.tm_year < 50)
+	{
+		tmp_time.tm_year += 100; // 00-49 => 2000-2049
+	}
+	else if(tmp_time.tm_year > 99)
+	{
+		tmp_time.tm_year -= 1900; // full year
+	}
+	// 50-99 already reads as 1950-1999
+
+	arg += parsed;
+
+	// Optional seconds, one or two digits
+	if(arg[0] == ':' && isdigit((unsigned char)arg[1]))
+	{
+		if(isdigit((unsigned char)arg[2]))
+		{
+			tmp_time.tm_sec = (arg[1] - '0') * 10 + (arg[2] - '0');
+			arg += 3;
+		}
+		else
+		{
+			tmp_time.tm_sec = arg[1] - '0';
+			arg += 2;
+		}
+	}
+
+	for(; isspace((unsigned char)arg[0]); arg++)
+		;
+
+	// Consider that nobody is going to transmit a timezone name which isn't GMT
+	if(arg[0] == 'G' && arg[1] == 'M' && arg[2] == 'T'
+	   && (arg[3] == '\0' || isspace((unsigned char)arg[3])))
+	{
+		tmp_time.tm_isdst  = 0;
+		tmp_time.tm_gmtoff = 0;
+		tmp_time.tm_zone   = "UTC";
+	}
+	else
+	{
+		arg = tzoffset_parse(arg, &tmp_time, errstr);
+		if(arg == NULL) return NULL;
+	}
+
+	// Done extracting directly from arg
+
+	tmp_time.tm_mon = -1;
+	// Because %b/%B is locale-dependent
+	for(size_t i = 0; i < 12; i++)
+	{
+		if(memcmp(month_name, short_month_name[i], 3) == 0)
+		{
+			tmp_time.tm_mon = (int)i;
+			break;
+		}
+	}
+	if(tmp_time.tm_mon < 0)
+	{
+		*errstr = "Failed parsing short month name";
+		errno   = 0;
+		return NULL;
+	}
+
+	*time = tmp_time;
+	return arg;
+}
+
// Sets errstr on failure
// YYYY-MM-DD[T ]hh:mm:SS([,\.]frac)?(Z|[+\-]hh:?mm)?
char *
@@ -44,7 +251,18 @@ iso_parse(char *arg, struct tm *time, long *nsec, const char **errstr)
return endptr;
}
- // No %F in POSIX
+ char *ret = email_date_parse(arg, time, errstr);
+ if(ret != NULL || *errstr != NULL)
+ {
+ return ret;
+ }
+
+ // Try parsing as RFC3339 subset of ISO 8601:1988
+
+ // FIXME?: Calling strptime() multiple times is explicitly unspecified in POSIX.1-2024
+ // instead a single strptime() call should be done
+
+ // No %F in POSIX prior to POSIX.1-2024 (<https://www.austingroupbugs.net/view.php?id=920>)
char *s = strptime(arg, "%Y-%m-%d", time);
if(s == NULL)
@@ -105,64 +323,14 @@ iso_parse(char *arg, struct tm *time, long *nsec, const char **errstr)
{
if(s[0] == 'Z' && s[1] == '\0')
{
+ time->tm_isdst = 0;
time->tm_gmtoff = 0;
time->tm_zone = "UTC";
}
else
{
-#ifndef TZNAME_MAX
-#define TZNAME_MAX _POSIX_TZNAME_MAX
-#endif
-#if TZNAME_MAX < 5
-#error TZNAME_MAX is too small
-#endif
- static char offname[TZNAME_MAX + 1] = "";
-
- int neg;
- if(s[0] == '+')
- neg = 0;
- else if(s[0] == '-')
- neg = 1;
- else
- {
- *errstr = "Invalid timezone offset, must start with + or -";
- return NULL;
- }
-
- size_t offname_i = 0;
- offname[offname_i++] = *s++;
-
- if(isdigit(s[0]) && isdigit(s[1]))
- {
- time->tm_gmtoff = (s[0] - '0') * 36000 + (s[1] - '0') * 3600;
- offname[offname_i++] = *s++;
- offname[offname_i++] = *s++;
- }
- else
- {
- *errstr = "Invalid timezone offset, no digits after <+|->";
- return NULL;
- }
-
- if(s[0] == ':') s++;
-
- if(isdigit(s[0]) && isdigit(s[1]))
- {
- time->tm_gmtoff += (s[0] - '0') * 600 + (s[1] - '0') * 60;
- offname[offname_i++] = *s++;
- offname[offname_i++] = *s++;
- }
- else
- {
- *errstr = "Invalid timezone offset, no digits after <+|->HH[:]";
- return NULL;
- }
-
- if(neg) time->tm_gmtoff = -time->tm_gmtoff;
-
- offname[offname_i++] = '\0';
-
- time->tm_zone = offname;
+ s = tzoffset_parse(s, time, errstr);
+ if(s == NULL) return NULL;
}
}
diff --git a/lib/iso_parse.mdoc b/lib/iso_parse.mdoc
@@ -6,13 +6,21 @@
.\" .Fl d Ar datetime
.\" .so lib/iso_parse.mdoc
.\"
-Should be formatted either with a leading @ (at) symbol followed by
+Should be formatted either with a leading @ (at) symbol followed by
the Unix timestamp (number of seconds before and after 1970-01-01 00:00:00Z),
for example
.Ql @1698791420
corresponds to 2023-10-31 23:30:20 UTC
.Pp
-Or as
+Or as Email / "Internet Message Format" (RFC5322, RFC2822, RFC822), for example:
+.Bl -bullet -compact
+.It
+.Ql Fri, 21 Nov 1997 09:55:06 -0600
+.It
+.Ql 21 Nov 97 09:55:06 GMT
+.El
+.Pp
+Or as RFC3339 which looks like
.Ql YYYY-MM-DDThh:mm:SS[frac][tz] ,
where:
.Bl -tag -width Ds
@@ -33,7 +41,7 @@ When empty it corresponds to local time.
Otherwise it can be an UTC offset in the format
.Ql [+-]HH:?MM
or the letter
-.Qq Z ,
+.Qq Z ,
signifying UTC.
.El
.Pp
diff --git a/test-cmd/date.sh b/test-cmd/date.sh
@@ -3,7 +3,7 @@
# SPDX-License-Identifier: MPL-2.0
target="$(dirname "$0")/../cmd/date"
-plans=27
+plans=31
. "$(dirname "$0")/tap.sh"
. "$(dirname "$0")/init_env.sh"
@@ -76,6 +76,24 @@ t 'iso Nano-Seconds' '-u -d 2025-02-10T21:05:53,437742835+01:00 -Ins' '2025-02-1
t '+foo%%bar' '+foo%%bar' 'foo%bar
'
+t_args 'email RFC2822-RFC5322-example1' '1997-11-21 15:55:06+0000
+' -u -d 'Fri, 21 Nov 1997 09:55:06 -0600' '+%F %T%z'
+
+# date -u -d '1969-02-13 23:32:00-0330' '+%F %T%z'
+t_args 'email RFC2822-RFC5322-whitespace-oddities' '1969-02-14 03:02:00+0000
+' -u -d 'Thu,
+ 13
+ Feb
+ 1969
+ 23:32
+ -0330' '+%F %T%z'
+
+t_args 'email RFC2822-RFC5322-obsolete-dates' '1997-11-21 09:55:06+0000
+' -u -d '21 Nov 97 09:55:06 GMT' '+%F %T%z'
+
+t_args 'email Y2K' '2017-11-21 09:55:06+0000
+' -u -d '21 Nov 17 09:55:06 GMT' '+%F %T%z'
+
#usage="\
#date [-uR] [-d datetime] [+format]
#date [-uR] -f now_format now [+format]