Skip to content

Commit

Permalink
Release of python v1.6.2
Browse files Browse the repository at this point in the history
  • Loading branch information
mubaldino committed Jun 11, 2024
1 parent 5991d29 commit 2a77424
Show file tree
Hide file tree
Showing 3 changed files with 897 additions and 0 deletions.
2 changes: 2 additions & 0 deletions src/main/python/MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
include LICENSE
# NOTE - all items here are copied from Java src/main/resources (ant script)
# except -- *_py.cfg for xtemp and xcoord extractors
include opensextant/resources/geocoord_patterns_py.cfg
include opensextant/resources/country-names-2021.csv
include opensextant/resources/country-names-2015.csv
Expand Down
267 changes: 267 additions & 0 deletions src/main/python/opensextant/resources/datetime_patterns_py.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,267 @@
/*
* NOTICE
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* **************************************************************************
* NOTICE
* This software was produced for the U. S. Government under Contract No.
* W15P7T-12-C-F600, and is subject to the Rights in Noncommercial Computer
* Software and Noncommercial Computer Software Documentation Clause
* 252.227-7014 (JUN 1995)
*
* (c) 2009-2013 The MITRE Corporation. All Rights Reserved.
* **************************************************************************
*/

// ALL Patterns below - defines, rules, etc. -- are for MATCHING.
// Parsing of actual fields named in defines is done after matches are found.
// Validation of parsed fields is last.

# Well-known month abbreviations.
#DEFINE MON_ABBREV JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEPT?|OCT|NOV|DEC

# A name starting with 3 ASCII letters as above, but followed by other letters, possibly not English or ASCII.
# Detection of month/day/year patterns with non-English month names is only a coincidence if they share a common prefix.
# Locales for date patterns and language options could be explored further. But is beyond scope.
#DEFINE MON_NAME JAN\w{0,6}|FEB\w{0,6}|MAR\w{0,2}|APR\w{0,3}|MAY|JUN\w{0,2}|JUL\w{0,2}|AUG\w{0,3}|SEP\w{0,6}|OCT\w{0,6}|NOV\w{0,6}|DEC\w{0,6}
#DEFINE DAY_ENUM th|nd|rd|st

# Fixed length fields
// In all practicality, year is 1xxx or 2xxx. Years 0001 to 0999 not really considered.
#DEFINE YEAR [12]\d{3}
#DEFINE YY '?\d\d

// Year/YY is 2-4 digits,... but could be 3. This is only used for matching. XTemp still validates matches.
// '76
//
#DEFINE YEARYY '?\d{2}|\d{4}
#DEFINE MM [01]\d
#DEFINE DD [0-3]\d
#DEFINE SHORT_TZ [A-Z]

#DEFINE hh [0-2]\d
#DEFINE mm [0-5]\d
#DEFINE ss [0-5]\d


# Variable length
#DEFINE DOM [0-3]?\d
#DEFINE MONTH [01]?\d
#DEFINE LONG_TZ [A-Z]{3,5}
# Variable length Day/Month digits:
#DEFINE DM1 [0-3]?\d
#DEFINE DM2 [0-3]?\d

#DEFINE OF of|Of|OF
# Do not use DSEP? (that is an optional separator.) Most field-separated patterns would be too noisy.
#DEFINE DSEP1 [-/.]
#DEFINE DSEP2 [-/.]

// ........................................
// Month, Day, Year patterns, MDY
// ........................................
// FORM: DATE: MM/DD/YY
#CLASS MDY opensextant.extractors.xtemporal.DateTimeMatch
#RULE MDY 01 \b<DM1>/<DM2>/<YY>\b
#TEST MDY 01 12/30/90
#TEST MDY 01 DATE: 12/30/90
#TEST MDY 01 DATE: 30/07/90 Jul 30, European locale
#TEST MDY 01 30/12/15 Dec 30, European locale
#TEST MDY 01 30/14/15 FAIL 14 is invalid
#TEST MDY 01 13/30/90 FAIL bad MON
#TEST MDY 01 12/32/90 FAIL bad DOM
#TEST MDY 01 12/30/01
#TEST MDY 01 12/30/00
#TEST MDY 01 12/30/55
#TEST MDY 01 1/30/55
#TEST MDY 01 12/30/15
#TEST MDY 01 12/1/15
#TEST MDY 01 12/01/15
#TEST MDY 01 15/01/15 Jan 15, 2015


// FORM: DATE: MM/DD/YYYY
#RULE MDY 02 \b<DM1>/<DM2>/<YEAR>\b
#TEST MDY 02 12/30/1990
#TEST MDY 02 DATE: 12/30/1990
#TEST MDY 02 13/30/1990 FAIL bad MON
#TEST MDY 02 12/32/1990 FAIL bad DOM
#TEST MDY 02 12/30/2001
#TEST MDY 02 12/30/0000 FAIL bad YYYY
#TEST MDY 02 12/30/1955
#TEST MDY 02 12/30/1915

// FORM: MMM DD, YYYY or MMM DD YYYY, MMM DD, YY, etc.
#RULE MDY 03 \b<MON_NAME>[\s.]+<DOM>[\s,.]+<YEARYY>\b
#TEST MDY 03 DEC 30, 1990
#TEST MDY 03 DEC 30 1990
#TEST MDY 03 DEC.30 1990
#TEST MDY 03 DEC. 30 1990
#TEST MDY 03 DEC.30.1990
#TEST MDY 03 DEC 30 90
#TEST MDY 03 DEC 30 990 FAIL bad year
#TEST MDY 03 DEC 00 1990 FAIL bad DOM
#TEST MDY 03 DEC 01 2300 FAIL ambiguous; time appears to be 2300 where year is expected
#TEST MDY 03 DECEMBER 30 1990
#TEST MDY 03 DECMEBER 30 90
#TEST MDY 03 DECIEMBRE 30 1990
#TEST MDY 03 DECIEMBRE 00 1990 # FAIL no 00 day
#TEST MDY 03 DECEMBER 01 2300 FAIL ambiguous; time appears to be 2300 where year is expected


// FORM: MMM, YYYY or Month, YYYY comma optional. 4-digit year required
#RULE MDY 04 \b<MON_NAME>[\s,.]+<YEAR>\b
#TEST MDY 04 DEC 1990
#TEST MDY 04 DEC, 1990
#TEST MDY 04 DEC. 1990
#TEST MDY 04 DECEMBER, 1990
#TEST MDY 04 DECIEMBRE, 1990
#TEST MDY 04 DÉCEMBRE, 1990
#TEST MDY 04 DÉC, 1990

// FORM: MMM of YYYY
#RULE MDY 04a \b<MON_NAME>\s+<OF>\s+<YEAR>\b
#TEST MDY 04a DEC of 1990
#TEST MDY 04a DECEMBER of 1990

#RULE MDY 05 \b<DOM>\s+<MON_NAME>[\s,]+<YEARYY>\b
#TEST MDY 05 30 DEC 1990
#TEST MDY 05 30 DEC 90
#TEST MDY 05 01 DEC 00
#TEST MDY 05 01 DEC 02
#TEST MDY 05 30 DECEMBER 1990
#TEST MDY 05 30 DECMEBER 1990
#TEST MDY 05 30 DECIEMBRE 1990

#RULE MDY 06a \b<MON_NAME>[\s.]+<DOM><DAY_ENUM>[\s,]+<YEAR>\b
#TEST MDY 06a September 19th, 2017
#TEST MDY 06a September 19th, 17 # FAIL
#TEST MDY 06a September 19 th, 17 # FAIL
#TEST MDY 06a Sept. 19th, 2017
#TEST MDY 06a Sept 19th, 2017
#TEST MDY 06a Sept 1st, 2017
#TEST MDY 06a Sept 23rd, 2017
#TEST MDY 06a Sept 15th, 2017
#TEST MDY 06a Sept 22nd, 2017

#RULE MDY 06b \b<DOM><DAY_ENUM>\s+<OF>?\s*<MON_NAME>[\s,]+<YEAR>\b
#TEST MDY 06b 19th September, 2017
#TEST MDY 06b 19th of September, 2017

#CLASS DMYT opensextant.extractors.xtemporal.DateTimeMatch
#RULE DMYT 01 \b<DOM>\s+<MON_NAME>[\s,]+<YEARYY> <hh>:?<mm>\b
#TEST DMYT 01 30 DEC 1990 0400
#TEST DMYT 01 30 DEC 90 0400
#TEST DMYT 01 11 JUN 14 1815 06:15 PM, 11 JUNE 2014
#TEST DMYT 01 25 March 2012 04:00
#TEST DMYT 01 25 March, 2012 04:00



// FORM: DATE: DD-MON-YYYY
#CLASS DMY opensextant.extractors.xtemporal.DateTimeMatch
#RULE DMY 01 \b<DOM>-<MON_NAME>-<YEARYY>\b
#TEST DMY 01 12-DEC-90
#TEST DMY 01 12-DEC-1990

// FORM: DATE: DD MON YYYY
#RULE DMY 02 \b<DOM>\s*<MON_NAME>\s*<YEARYY>\b
#TEST DMY 02 12 DEC 90
#TEST DMY 02 12 DEC 1990
#TEST DMY 02 12DEC90
#TEST DMY 02 12DEC1990
#TEST DMY 02 12MARCH1999
#TEST DMY 02 12FEBBRAIO1999
#TEST DMY 02 12JUL1999
#TEST DMY 02 12JULIO1999

// FORM: DATE: YYYY-MM-DD as it appears in free text of documents.
// The limitations of this pattern are related to how it was used.
// This is a relatively modern format; was this format used in text in 1700s?
#CLASS YMD opensextant.extractors.xtemporal.DateTimeMatch
#RULE YMD 01 \b<YEAR><DSEP1><MM><DSEP2><DOM>\b
#TEST YMD 01 2001-11-11
#TEST YMD 01 0001-04-34 # FAIL
#TEST YMD 01 1001-04-30 # FAIL
#TEST YMD 01 2001-04-30
#TEST YMD 01 1990-04-30
#TEST YMD 01 1790-04-30 # FAIL -- 1800 01 01 is earliest date for this pattern.
#TEST YMD 01 a2001-04-30 # FAIL
#TEST YMD 01 c2001-04-30 # FAIL
#TEST YMD 01 42001-04-30 # FAIL


// ........................................
// DATE TIME PATTERNS, DTM
// ........................................
#CLASS DTM opensextant.extractors.xtemporal.DateTimeMatch

// FORM: A|O|P|R DDHHMMZ MMM YY
#RULE DTM 01 \b<DD><hh><mm><SHORT_TZ>\s*<MON_ABBREV>\s*<YY>\b
#TEST DTM 01 A 301400Z DEC 90
#TEST DTM 01 R 301400Z DEC 90
#TEST DTM 01 A 351400Z DEC 90 # FAIL day out of range

// FORM: YYYYMMDDTHHMMZ
#RULE DTM 02 \b<YEAR><MM><DD>T<hh><mm><SHORT_TZ>\b
#TEST DTM 02 20101230T1400Z

// FORM: YYYYMMDDTHHMM ZZZ
#RULE DTM 02a \b<YEAR><MM><DD>T<hh><mm> <LONG_TZ>\b
#TEST DTM 02a 20101230T1400 EST # UTC-0500. parses to 1900 UTC
#TEST DTM 02a 20101230T1400 BNT # UTC+0800. parses to 0600 UTC
#TEST DTM 02a 20101230T1400 XXX # FAIL -- invalid TZ
#TEST DTM 02a 20101230T1400 PST # UTC-0800. parses to UTC

// FORM: YYYY-MM-DDTHH:MM:SS ... ISO Time. ISO 8601 uses "-", not "/". But should be validated in normalization.
// DTM 04 is collapsed into this pattern.
#RULE DTM 03 \b<YEAR><DSEP1><MM><DSEP2><DD>[T ]<hh>:<mm>:<ss>
#TEST DTM 03 2010-12-30T14:00:01:12
#TEST DTM 03 2010-12-30T14:00:02

// FORM: YYYY-MM-DDTHH:MM ... ISO Time. See 03 above.
#RULE DTM 03b \b<YEAR><DSEP1><MM><DSEP2><DD>[T ]<hh>:<mm>
#TEST DTM 03b 2010-12-30T14:01:11:12
#TEST DTM 03b 2010-12-30T14:02:12
#TEST DTM 03b 2010-12/30T14:03:13 # FAIL

// FORM: MM/DD/YY* HH:MM:SS.
// TODO: 12-hour clock time and detect PM/AM. This HH:MM is only 24 hour clock.
#RULE DTM 05a \b<MONTH><DSEP1><DOM><DSEP2><YEARYY>\s?<hh>:<mm>:<ss>
#TEST DTM 05a 12-30-20 14:00:00:12 # extra data associated with value?
#TEST DTM 05a 12-30-2020 14:00:01
#TEST DTM 05a 12/30-2020 14:00:02 # FAIL Test mixed punctuation.
#TEST DTM 05a 12/30/2020 14:00:02 # Test mixed punctuation.
#TEST DTM 05a 12/30/20 14:00 # Test mixed punctuation.

// FORM: MM/DD/YY* HH:MM.
// TODO: 12-hour clock time and detect PM/AM. This HH:MM is only 24 hour clock.
#RULE DTM 05b \b<MONTH><DSEP1><DOM><DSEP2><YEARYY>\s?<hh>:<mm>
#TEST DTM 05b 12-30-20 14:00:00:12 # extra data associated with value?
#TEST DTM 05b 12-30-2020 14:01:01
#TEST DTM 05b 12/30-2020 14:02:02 # FAIL Test mixed punctuation.
#TEST DTM 05b 12/30/2020 14:03:02 # Test mixed punctuation.
#TEST DTM 05b 12/30/20 14:04 # Test mixed punctuation.
#TEST DTM 05b 12.30.20 14:04 # European convention for Date.
#TEST DTM 05b 2.30.20 14:04 # FAIL No 30 FEB. European convention for Date. Non-zero padded
#TEST DTM 05b 2.30/2020 14:04 # FAIL No 30 FEB., Mixed Separators. European convention for Date. Non-zero padded.
#TEST DTM 05b 4.30.20 14:04 # European convention for Date. Non-zero padded
#TEST DTM 05b 4.30/2020 14:04 # FAIL Mixed Separators. European convention for Date. Non-zero padded.


// 8-digit date is great when used in short spans of text; But as a general pattern, this matches any 8 digit number.
// Not very accurate on data or large texts.
// FORM: YYYYMMDD
//#RULE DTM xx \b<YEAR><MM><DD>\b
//#TEST DTM xx 20101230

Loading

0 comments on commit 2a77424

Please sign in to comment.