-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
3 changed files
with
897 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
267 changes: 267 additions & 0 deletions
267
src/main/python/opensextant/resources/datetime_patterns_py.cfg
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,267 @@ | ||
/* | ||
* NOTICE | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
* | ||
* ************************************************************************** | ||
* NOTICE | ||
* This software was produced for the U. S. Government under Contract No. | ||
* W15P7T-12-C-F600, and is subject to the Rights in Noncommercial Computer | ||
* Software and Noncommercial Computer Software Documentation Clause | ||
* 252.227-7014 (JUN 1995) | ||
* | ||
* (c) 2009-2013 The MITRE Corporation. All Rights Reserved. | ||
* ************************************************************************** | ||
*/ | ||
|
||
// ALL Patterns below - defines, rules, etc. -- are for MATCHING. | ||
// Parsing of actual fields named in defines is done after matches are found. | ||
// Validation of parsed fields is last. | ||
|
||
# Well-known month abbreviations. | ||
#DEFINE MON_ABBREV JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEPT?|OCT|NOV|DEC | ||
|
||
# A name starting with 3 ASCII letters as above, but followed by other letters, possibly not English or ASCII. | ||
# Detection of month/day/year patterns with non-English month names is only a coincidence if they share a common prefix. | ||
# Locales for date patterns and language options could be explored further. But is beyond scope. | ||
#DEFINE MON_NAME JAN\w{0,6}|FEB\w{0,6}|MAR\w{0,2}|APR\w{0,3}|MAY|JUN\w{0,2}|JUL\w{0,2}|AUG\w{0,3}|SEP\w{0,6}|OCT\w{0,6}|NOV\w{0,6}|DEC\w{0,6} | ||
#DEFINE DAY_ENUM th|nd|rd|st | ||
|
||
# Fixed length fields | ||
// In all practicality, year is 1xxx or 2xxx. Years 0001 to 0999 not really considered. | ||
#DEFINE YEAR [12]\d{3} | ||
#DEFINE YY '?\d\d | ||
|
||
// Year/YY is 2-4 digits,... but could be 3. This is only used for matching. XTemp still validates matches. | ||
// '76 | ||
// | ||
#DEFINE YEARYY '?\d{2}|\d{4} | ||
#DEFINE MM [01]\d | ||
#DEFINE DD [0-3]\d | ||
#DEFINE SHORT_TZ [A-Z] | ||
|
||
#DEFINE hh [0-2]\d | ||
#DEFINE mm [0-5]\d | ||
#DEFINE ss [0-5]\d | ||
|
||
|
||
# Variable length | ||
#DEFINE DOM [0-3]?\d | ||
#DEFINE MONTH [01]?\d | ||
#DEFINE LONG_TZ [A-Z]{3,5} | ||
# Variable length Day/Month digits: | ||
#DEFINE DM1 [0-3]?\d | ||
#DEFINE DM2 [0-3]?\d | ||
|
||
#DEFINE OF of|Of|OF | ||
# Do not use DSEP? (that is an optional separator.) Most field-separated patterns would be too noisy. | ||
#DEFINE DSEP1 [-/.] | ||
#DEFINE DSEP2 [-/.] | ||
|
||
// ........................................ | ||
// Month, Day, Year patterns, MDY | ||
// ........................................ | ||
// FORM: DATE: MM/DD/YY | ||
#CLASS MDY opensextant.extractors.xtemporal.DateTimeMatch | ||
#RULE MDY 01 \b<DM1>/<DM2>/<YY>\b | ||
#TEST MDY 01 12/30/90 | ||
#TEST MDY 01 DATE: 12/30/90 | ||
#TEST MDY 01 DATE: 30/07/90 Jul 30, European locale | ||
#TEST MDY 01 30/12/15 Dec 30, European locale | ||
#TEST MDY 01 30/14/15 FAIL 14 is invalid | ||
#TEST MDY 01 13/30/90 FAIL bad MON | ||
#TEST MDY 01 12/32/90 FAIL bad DOM | ||
#TEST MDY 01 12/30/01 | ||
#TEST MDY 01 12/30/00 | ||
#TEST MDY 01 12/30/55 | ||
#TEST MDY 01 1/30/55 | ||
#TEST MDY 01 12/30/15 | ||
#TEST MDY 01 12/1/15 | ||
#TEST MDY 01 12/01/15 | ||
#TEST MDY 01 15/01/15 Jan 15, 2015 | ||
|
||
|
||
// FORM: DATE: MM/DD/YYYY | ||
#RULE MDY 02 \b<DM1>/<DM2>/<YEAR>\b | ||
#TEST MDY 02 12/30/1990 | ||
#TEST MDY 02 DATE: 12/30/1990 | ||
#TEST MDY 02 13/30/1990 FAIL bad MON | ||
#TEST MDY 02 12/32/1990 FAIL bad DOM | ||
#TEST MDY 02 12/30/2001 | ||
#TEST MDY 02 12/30/0000 FAIL bad YYYY | ||
#TEST MDY 02 12/30/1955 | ||
#TEST MDY 02 12/30/1915 | ||
|
||
// FORM: MMM DD, YYYY or MMM DD YYYY, MMM DD, YY, etc. | ||
#RULE MDY 03 \b<MON_NAME>[\s.]+<DOM>[\s,.]+<YEARYY>\b | ||
#TEST MDY 03 DEC 30, 1990 | ||
#TEST MDY 03 DEC 30 1990 | ||
#TEST MDY 03 DEC.30 1990 | ||
#TEST MDY 03 DEC. 30 1990 | ||
#TEST MDY 03 DEC.30.1990 | ||
#TEST MDY 03 DEC 30 90 | ||
#TEST MDY 03 DEC 30 990 FAIL bad year | ||
#TEST MDY 03 DEC 00 1990 FAIL bad DOM | ||
#TEST MDY 03 DEC 01 2300 FAIL ambiguous; time appears to be 2300 where year is expected | ||
#TEST MDY 03 DECEMBER 30 1990 | ||
#TEST MDY 03 DECMEBER 30 90 | ||
#TEST MDY 03 DECIEMBRE 30 1990 | ||
#TEST MDY 03 DECIEMBRE 00 1990 # FAIL no 00 day | ||
#TEST MDY 03 DECEMBER 01 2300 FAIL ambiguous; time appears to be 2300 where year is expected | ||
|
||
|
||
// FORM: MMM, YYYY or Month, YYYY comma optional. 4-digit year required | ||
#RULE MDY 04 \b<MON_NAME>[\s,.]+<YEAR>\b | ||
#TEST MDY 04 DEC 1990 | ||
#TEST MDY 04 DEC, 1990 | ||
#TEST MDY 04 DEC. 1990 | ||
#TEST MDY 04 DECEMBER, 1990 | ||
#TEST MDY 04 DECIEMBRE, 1990 | ||
#TEST MDY 04 DÉCEMBRE, 1990 | ||
#TEST MDY 04 DÉC, 1990 | ||
|
||
// FORM: MMM of YYYY | ||
#RULE MDY 04a \b<MON_NAME>\s+<OF>\s+<YEAR>\b | ||
#TEST MDY 04a DEC of 1990 | ||
#TEST MDY 04a DECEMBER of 1990 | ||
|
||
#RULE MDY 05 \b<DOM>\s+<MON_NAME>[\s,]+<YEARYY>\b | ||
#TEST MDY 05 30 DEC 1990 | ||
#TEST MDY 05 30 DEC 90 | ||
#TEST MDY 05 01 DEC 00 | ||
#TEST MDY 05 01 DEC 02 | ||
#TEST MDY 05 30 DECEMBER 1990 | ||
#TEST MDY 05 30 DECMEBER 1990 | ||
#TEST MDY 05 30 DECIEMBRE 1990 | ||
|
||
#RULE MDY 06a \b<MON_NAME>[\s.]+<DOM><DAY_ENUM>[\s,]+<YEAR>\b | ||
#TEST MDY 06a September 19th, 2017 | ||
#TEST MDY 06a September 19th, 17 # FAIL | ||
#TEST MDY 06a September 19 th, 17 # FAIL | ||
#TEST MDY 06a Sept. 19th, 2017 | ||
#TEST MDY 06a Sept 19th, 2017 | ||
#TEST MDY 06a Sept 1st, 2017 | ||
#TEST MDY 06a Sept 23rd, 2017 | ||
#TEST MDY 06a Sept 15th, 2017 | ||
#TEST MDY 06a Sept 22nd, 2017 | ||
|
||
#RULE MDY 06b \b<DOM><DAY_ENUM>\s+<OF>?\s*<MON_NAME>[\s,]+<YEAR>\b | ||
#TEST MDY 06b 19th September, 2017 | ||
#TEST MDY 06b 19th of September, 2017 | ||
|
||
#CLASS DMYT opensextant.extractors.xtemporal.DateTimeMatch | ||
#RULE DMYT 01 \b<DOM>\s+<MON_NAME>[\s,]+<YEARYY> <hh>:?<mm>\b | ||
#TEST DMYT 01 30 DEC 1990 0400 | ||
#TEST DMYT 01 30 DEC 90 0400 | ||
#TEST DMYT 01 11 JUN 14 1815 06:15 PM, 11 JUNE 2014 | ||
#TEST DMYT 01 25 March 2012 04:00 | ||
#TEST DMYT 01 25 March, 2012 04:00 | ||
|
||
|
||
|
||
// FORM: DATE: DD-MON-YYYY | ||
#CLASS DMY opensextant.extractors.xtemporal.DateTimeMatch | ||
#RULE DMY 01 \b<DOM>-<MON_NAME>-<YEARYY>\b | ||
#TEST DMY 01 12-DEC-90 | ||
#TEST DMY 01 12-DEC-1990 | ||
|
||
// FORM: DATE: DD MON YYYY | ||
#RULE DMY 02 \b<DOM>\s*<MON_NAME>\s*<YEARYY>\b | ||
#TEST DMY 02 12 DEC 90 | ||
#TEST DMY 02 12 DEC 1990 | ||
#TEST DMY 02 12DEC90 | ||
#TEST DMY 02 12DEC1990 | ||
#TEST DMY 02 12MARCH1999 | ||
#TEST DMY 02 12FEBBRAIO1999 | ||
#TEST DMY 02 12JUL1999 | ||
#TEST DMY 02 12JULIO1999 | ||
|
||
// FORM: DATE: YYYY-MM-DD as it appears in free text of documents. | ||
// The limitations of this pattern are related to how it was used. | ||
// This is a relatively modern format; was this format used in text in 1700s? | ||
#CLASS YMD opensextant.extractors.xtemporal.DateTimeMatch | ||
#RULE YMD 01 \b<YEAR><DSEP1><MM><DSEP2><DOM>\b | ||
#TEST YMD 01 2001-11-11 | ||
#TEST YMD 01 0001-04-34 # FAIL | ||
#TEST YMD 01 1001-04-30 # FAIL | ||
#TEST YMD 01 2001-04-30 | ||
#TEST YMD 01 1990-04-30 | ||
#TEST YMD 01 1790-04-30 # FAIL -- 1800 01 01 is earliest date for this pattern. | ||
#TEST YMD 01 a2001-04-30 # FAIL | ||
#TEST YMD 01 c2001-04-30 # FAIL | ||
#TEST YMD 01 42001-04-30 # FAIL | ||
|
||
|
||
// ........................................ | ||
// DATE TIME PATTERNS, DTM | ||
// ........................................ | ||
#CLASS DTM opensextant.extractors.xtemporal.DateTimeMatch | ||
|
||
// FORM: A|O|P|R DDHHMMZ MMM YY | ||
#RULE DTM 01 \b<DD><hh><mm><SHORT_TZ>\s*<MON_ABBREV>\s*<YY>\b | ||
#TEST DTM 01 A 301400Z DEC 90 | ||
#TEST DTM 01 R 301400Z DEC 90 | ||
#TEST DTM 01 A 351400Z DEC 90 # FAIL day out of range | ||
|
||
// FORM: YYYYMMDDTHHMMZ | ||
#RULE DTM 02 \b<YEAR><MM><DD>T<hh><mm><SHORT_TZ>\b | ||
#TEST DTM 02 20101230T1400Z | ||
|
||
// FORM: YYYYMMDDTHHMM ZZZ | ||
#RULE DTM 02a \b<YEAR><MM><DD>T<hh><mm> <LONG_TZ>\b | ||
#TEST DTM 02a 20101230T1400 EST # UTC-0500. parses to 1900 UTC | ||
#TEST DTM 02a 20101230T1400 BNT # UTC+0800. parses to 0600 UTC | ||
#TEST DTM 02a 20101230T1400 XXX # FAIL -- invalid TZ | ||
#TEST DTM 02a 20101230T1400 PST # UTC-0800. parses to UTC | ||
|
||
// FORM: YYYY-MM-DDTHH:MM:SS ... ISO Time. ISO 8601 uses "-", not "/". But should be validated in normalization. | ||
// DTM 04 is collapsed into this pattern. | ||
#RULE DTM 03 \b<YEAR><DSEP1><MM><DSEP2><DD>[T ]<hh>:<mm>:<ss> | ||
#TEST DTM 03 2010-12-30T14:00:01:12 | ||
#TEST DTM 03 2010-12-30T14:00:02 | ||
|
||
// FORM: YYYY-MM-DDTHH:MM ... ISO Time. See 03 above. | ||
#RULE DTM 03b \b<YEAR><DSEP1><MM><DSEP2><DD>[T ]<hh>:<mm> | ||
#TEST DTM 03b 2010-12-30T14:01:11:12 | ||
#TEST DTM 03b 2010-12-30T14:02:12 | ||
#TEST DTM 03b 2010-12/30T14:03:13 # FAIL | ||
|
||
// FORM: MM/DD/YY* HH:MM:SS. | ||
// TODO: 12-hour clock time and detect PM/AM. This HH:MM is only 24 hour clock. | ||
#RULE DTM 05a \b<MONTH><DSEP1><DOM><DSEP2><YEARYY>\s?<hh>:<mm>:<ss> | ||
#TEST DTM 05a 12-30-20 14:00:00:12 # extra data associated with value? | ||
#TEST DTM 05a 12-30-2020 14:00:01 | ||
#TEST DTM 05a 12/30-2020 14:00:02 # FAIL Test mixed punctuation. | ||
#TEST DTM 05a 12/30/2020 14:00:02 # Test mixed punctuation. | ||
#TEST DTM 05a 12/30/20 14:00 # Test mixed punctuation. | ||
|
||
// FORM: MM/DD/YY* HH:MM. | ||
// TODO: 12-hour clock time and detect PM/AM. This HH:MM is only 24 hour clock. | ||
#RULE DTM 05b \b<MONTH><DSEP1><DOM><DSEP2><YEARYY>\s?<hh>:<mm> | ||
#TEST DTM 05b 12-30-20 14:00:00:12 # extra data associated with value? | ||
#TEST DTM 05b 12-30-2020 14:01:01 | ||
#TEST DTM 05b 12/30-2020 14:02:02 # FAIL Test mixed punctuation. | ||
#TEST DTM 05b 12/30/2020 14:03:02 # Test mixed punctuation. | ||
#TEST DTM 05b 12/30/20 14:04 # Test mixed punctuation. | ||
#TEST DTM 05b 12.30.20 14:04 # European convention for Date. | ||
#TEST DTM 05b 2.30.20 14:04 # FAIL No 30 FEB. European convention for Date. Non-zero padded | ||
#TEST DTM 05b 2.30/2020 14:04 # FAIL No 30 FEB., Mixed Separators. European convention for Date. Non-zero padded. | ||
#TEST DTM 05b 4.30.20 14:04 # European convention for Date. Non-zero padded | ||
#TEST DTM 05b 4.30/2020 14:04 # FAIL Mixed Separators. European convention for Date. Non-zero padded. | ||
|
||
|
||
// 8-digit date is great when used in short spans of text; But as a general pattern, this matches any 8 digit number. | ||
// Not very accurate on data or large texts. | ||
// FORM: YYYYMMDD | ||
//#RULE DTM xx \b<YEAR><MM><DD>\b | ||
//#TEST DTM xx 20101230 | ||
|
Oops, something went wrong.