From 5e1978ea088862e7b852516eb07cd159aecb7b69 Mon Sep 17 00:00:00 2001 From: GERZAC1002 Date: Mon, 11 Apr 2022 08:55:32 +0200 Subject: [PATCH 1/3] Merged Add support to backup pages using API:Query instead of Special:Export https://github.com/WikiTeam/wikiteam/pull/280 into recent version of dumpgenerator.py Adds additional parameter: --apiexport which uses a query request instead of submit on api.php which works without Special:Export which is disabled on some sites. --- dumpgenerator.py | 252 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 243 insertions(+), 9 deletions(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index bd27ff17..a2d5fabc 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -23,10 +23,21 @@ from kitchen.text.converters import getwriter, to_unicode except ImportError: print "Please install the kitchen module." + +try: + import xml.etree.cElementTree as ET +except ImportError: + import xml.etree.ElementTree as ET + +import xml.dom.minidom as MD + import cookielib import cPickle import datetime import sys +import io +import traceback + try: import argparse except ImportError: @@ -63,7 +74,7 @@ UTF8Writer = getwriter('utf8') sys.stdout = UTF8Writer(sys.stdout) -__VERSION__ = '0.4.0-alpha' # major, minor, micro: semver.org +__VERSION__ = '0.5.0-alpha' # major, minor, micro: semver.org class PageMissingError(Exception): def __init__(self, title, xml): @@ -164,7 +175,7 @@ def getNamespacesScraper(config={}, session=None): namespacenames = {0: ''} # main is 0, no prefix if namespaces: r = session.post( - url=config['index'], params={'title': 'Special:Allpages'}, timeout=30) + url=config['index'], params={'title': 'Special:Allpages'}, timeout=120) raw = r.text delay(config=config, session=session) @@ -206,7 +217,7 @@ def getNamespacesAPI(config={}, session=None): 'meta': 'siteinfo', 'siprop': 'namespaces', 'format': 'json'}, - timeout=30 + timeout=120 ) result = getJSON(r) delay(config=config, session=session) @@ -281,7 +292,7 @@ def getPageTitlesScraper(config={}, session=None): print ' Retrieving titles in the namespace', namespace url = '%s?title=Special:Allpages&namespace=%s' % ( config['index'], namespace) - r = session.get(url=url, timeout=30) + r = session.get(url=url, timeout=120) raw = r.text raw = cleanHTML(raw) @@ -455,7 +466,7 @@ def getXMLHeader(config={}, session=None): else: try: - xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)]) + xml = "".join([x for x in getXMLPage_(config=config, title=randomtitle, verbose=False, session=session)]) except PageMissingError as pme: # The does not exist. Not a problem, if we get the . 
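For reference, the two request shapes behind --apiexport can be tried by hand. The standalone sketch below is not part of the patch: the api.php URL and page title are placeholders, and the parameters simply mirror the ones built by getXMLPageWithApi further down.

# Rough sketch of the api.php requests issued in --apiexport mode.
# The endpoint and title are placeholders, not taken from the patch.
import requests

API = 'https://example.org/w/api.php'
session = requests.Session()

# --curonly case: ask the API to emit export-style XML directly.
r = session.get(API, params={
    'action': 'query', 'titles': 'Main Page',
    'export': 1, 'exportnowrap': 1, 'format': 'xml'})
print(r.text[:200])

# Full-history case: page through prop=revisions and rebuild the export XML
# client-side, which is what reconstructRevisions() below takes care of.
r = session.get(API, params={
    'action': 'query', 'titles': 'Main Page', 'prop': 'revisions',
    'rvprop': 'ids|timestamp|user|userid|comment|content|size|flags',
    'rvlimit': 10, 'format': 'json'})
print(list(r.json()['query']['pages'].keys()))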
xml = pme.xml @@ -477,7 +488,7 @@ def getXMLHeader(config={}, session=None): ) config['export'] = json.loads(r.text)['query']['namespaces']['-1']['*'] \ + ':Export' - xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)]) + xml = "".join([x for x in getXMLPage_(config=config, title=randomtitle, verbose=False, session=session)]) except PageMissingError as pme: xml = pme.xml except ExportAbortedError: @@ -500,7 +511,7 @@ def getXMLHeader(config={}, session=None): def getXMLFileDesc(config={}, title='', session=None): """ Get XML for image description page """ config['curonly'] = 1 # tricky to get only the most recent desc - return("".join([x for x in getXMLPage( config=config, title=title, verbose=False, session=session)])) + return("".join([x for x in getXMLPage_( config=config, title=title, verbose=False, session=session)])) def getUserAgent(): @@ -521,7 +532,216 @@ def logerror(config={}, text=''): output = u'%s: %s\n' % ( datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), text) outfile.write(output.encode('utf-8')) +def reconstructRevisions(root=None): + #print ET.tostring(rev) + page = ET.Element('stub') + edits = 0 + for rev in root.find('query').find('pages').find('page').find('revisions').findall('rev'): + try: + rev_ = ET.SubElement(page,'revision') + ET.SubElement(rev_,'id').text = rev.attrib['revid'] + ET.SubElement(rev_,'timestamp').text = rev.attrib['timestamp'] + contributor = ET.SubElement(rev_,'contributor') + if not rev.attrib.has_key('userhidden'): + ET.SubElement(contributor,'username').text = rev.attrib['user'] + ET.SubElement(contributor,'id').text = rev.attrib['userid'] + else: + contributor.set('deleted','deleted') + comment = ET.SubElement(rev_,'comment') + if not rev.attrib.has_key('commenthidden'): + comment.text = rev.attrib['comment'] + else: + comment.set('deleted','deleted') + + # some revision does not return model and format, so just use hard-code + ET.SubElement(rev_,'model').text = 'wikitext' + ET.SubElement(rev_,'format').text = 'text/x-wiki' + text = ET.SubElement(rev_,'text') + if not rev.attrib.has_key('texthidden'): + text.attrib['xml:space'] = "preserve" + text.attrib['bytes'] = rev.attrib['size'] + text.text = rev.text + else: + text.set('deleted','deleted') + # delete sha1 here :) + #sha1 = ET.SubElement(rev_,'sha1') + #if not rev.attrib.has_key('sha1missing'): + #sha1.text = rev.attrib['sha1'] + if rev.attrib.has_key('minor'): + ET.SubElement(rev_,'minor') + edits += 1 + except Exception as e: + #logerror(config=config, text='Error reconstructing revision, xml:%s' % (ET.tostring(rev))) + print ET.tostring(rev) + traceback.print_exc() + page = None + edits = 0 + raise e + return page,edits + +def getXMLPageCoreWithApi(headers={}, params={}, config={}, session=None): + """ """ + # just send the API request + # if it fails, it will reduce params['rvlimit'] + xml = '' + c = 0 + maxseconds = 100 # max seconds to wait in a single sleeping + maxretries = config['retries'] # x retries and skip + increment = 20 # increment every retry + while not re.search(r'' if not config['curonly'] else r'', xml) or re.search(r'', xml): + if c > 0 and c < maxretries: + wait = increment * c < maxseconds and increment * \ + c or maxseconds # incremental until maxseconds + print ' In attempt %d, XML for "%s" is wrong. 
Waiting %d seconds and reloading...'%(c, params['titles' if config['apiexport'] else 'pages'], wait) + time.sleep(wait) + # reducing server load requesting smallest chunks (if curonly then + # rvlimit = 1 from mother function) + if params['rvlimit'] > 1: + params['rvlimit'] = params['rvlimit'] / 2 # half + if c >= maxretries: + print ' We have retried %d times' % (c) + print ' MediaWiki error for "%s", network error or whatever...' % (params['titles' if config['apiexport'] else 'pages']) + # If it's not already what we tried: our last chance, preserve only the last revision... + # config['curonly'] means that the whole dump is configured to save only the last, + # params['curonly'] should mean that we've already tried this + # fallback, because it's set by the following if and passed to + # getXMLPageCore + # TODO: save only the last version when failed + print ' Saving in the errors log, and skipping...' + logerror( + config=config, + text=u'Error while retrieving the last revision of "%s". Skipping.' % + (params['titles' if config['apiexport'] else 'pages']).decode('utf-8')) + #raise ExportAbortedError(config['index']) + return '' # empty xml + + # FIXME HANDLE HTTP Errors HERE + try: + r = session.get(url=config['api'], params=params, headers=headers) + handleStatusCode(r) + xml = fixBOM(r) + #print xml + except requests.exceptions.ConnectionError as e: + print ' Connection error: %s'%(str(e[0])) + xml = '' + c += 1 + return xml +def getXMLPageWithApi(config={}, title='', verbose=True, session=None): + """ Get the full history (or current only) of a page using API:Query + if params['curonly'] is set, then using export&exportwrap to export + """ + + title_ = title + title_ = re.sub(' ', '_', title_) + # do not convert & into %26, title_ = re.sub('&', '%26', title_) + # action=query&rvlimit=50&format=xml&prop=revisions&titles=TITLE_HERE + # &rvprop=timestamp%7Cuser%7Ccomment%7Ccontent%7Cids%7Cuserid%7Csha1%7Csize + #print 'current:%s' % (title_) + if not config['curonly']: + params = {'titles': title_, 'action': 'query','format':'xml', + 'prop':'revisions', + 'rvprop' : 'timestamp|user|comment|content|ids|userid|sha1|size|flags', + 'rvcontinue' : None, + 'rvlimit' : 10 # TODO: set this by commandline + } + else: + params = {'titles': title_, 'action': 'query','format':'xml','export':1,'exportnowrap':1} + #print 'params:%s' % (params) + if not config['curonly']: + firstpartok = False + lastcontinue = None + numberofedits = 0 + ret = '' + while True: + # in case the last request is not right, saving last time's progress + if not firstpartok: + try: + lastcontinue = params['rvcontinue'] + except: + lastcontinue = None + + xml = getXMLPageCoreWithApi(params=params, config=config, session=session) + if xml == "": + #just return so that we can continue, and getXMLPageCoreWithApi will log the error + return + try: + root = ET.fromstring(xml.encode('utf-8')) + except: + continue + try: + retpage = root.find('query').find('pages').find('page') + except: + continue + if retpage.attrib.has_key('missing') or retpage.attrib.has_key('invalid'): + print 'Page not found' + raise PageMissingError(params['titles'], xml) + if not firstpartok: + try: + # build the firstpart by ourselves to improve the memory usage + ret = ' \n' + ret += ' %s\n' %(retpage.attrib['title']) + ret += ' %s\n' % (retpage.attrib['ns']) + ret += ' %s\n' % (retpage.attrib['pageid']) + except: + firstpartok = False + continue + else: + firstpartok = True + yield ret + try: + ret = '' + edits = 0 + if config['curonly'] or 
root.find('continue') == None: + # transform the revision + rev_,edits = reconstructRevisions(root=root) + xmldom = MD.parseString(''+ET.tostring(rev_)+'') + # convert it into text in case it throws MemoryError + # delete the first three line and last two line,which is for setting the indent + ret += ''.join(xmldom.toprettyxml(indent=' ').splitlines(True)[3:-2]) + yield ret + numberofedits += edits + break + else: + rev_,edits = reconstructRevisions(root=root) + xmldom = MD.parseString('' + ET.tostring(rev_) + '') + ret += ''.join(xmldom.toprettyxml(indent=' ').splitlines(True)[3:-2]) + params['rvcontinue'] = root.find('continue').attrib['rvcontinue'] + numberofedits += edits + yield ret + except: + traceback.print_exc() + params['rvcontinue'] = lastcontinue + ret = '' + yield ' \n' + else: + xml = getXMLPageCoreWithApi(params=params, config=config, session=session) + if xml == "": + raise ExportAbortedError(config['index']) + if not "" in xml: + raise PageMissingError(params['titles'], xml) + else: + # strip these sha1s sums which keep showing up in the export and + # which are invalid for the XML schema (they only apply to + # revisions) + xml = re.sub(r'\n\s*\w+\s*\n', r'\n', xml) + xml = re.sub(r'\n\s*\s*\n', r'\n', xml) + + yield xml.split("")[0] + + # just for looking good :) + r_timestamp = r'([^<]+)' + + numberofedits = 0 + numberofedits += len(re.findall(r_timestamp, xml)) + + yield "\n" + + if verbose: + if (numberofedits == 1): + print ' %s, 1 edit' % (title.strip()) + else: + print ' %s, %d edits' % (title.strip(), numberofedits) def getXMLPageCore(headers={}, params={}, config={}, session=None): """ """ @@ -694,7 +914,13 @@ def getXMLPage(config={}, title='', verbose=True, session=None): print ' %s, 1 edit' % (title.strip()) else: print ' %s, %d edits' % (title.strip(), numberofedits) - +def getXMLPage_(config={}, title='', verbose=True, session=None): + #print config + if config['apiexport']: + return getXMLPageWithApi(config=config, title=title, verbose=verbose, session=session) + else: + return getXMLPage(config=config, title=title, verbose=verbose, session=session) + return '' def makeXmlPageFromRaw(xml): """ Discard the metadata around a element in string""" @@ -775,7 +1001,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None): if c % 10 == 0: print 'Downloaded %d pages' % (c) try: - for xml in getXMLPage(config=config, title=title, session=session): + for xml in getXMLPage_(config=config, title=title, session=session): xml = cleanXML(xml=xml) xmlfile.write(xml.encode('utf-8')) except PageMissingError: @@ -1680,6 +1906,7 @@ def getParameters(params=[]): action='store_true', help='resumes previous incomplete dump (requires --path)') parser.add_argument('--force', action='store_true', help='') + parser.add_argument('--ignore-api-check', action='store_true', help='') parser.add_argument( '--user', help='Username if authentication is required.') parser.add_argument( @@ -1723,6 +1950,10 @@ def getParameters(params=[]): '--exnamespaces', metavar="1,2,3", help='comma-separated value of namespaces to exclude') + groupDownload.add_argument( + '--apiexport', + action='store_true', + help="Using API instead of Special:Export to export pages") # Meta info params groupMeta = parser.add_argument_group( @@ -1824,6 +2055,8 @@ def getParameters(params=[]): index2 = check[1] api = checkedapi print 'API is OK: ' + checkedapi + elif args.ignore_api_check: + print 'Error in API. Ignoring.' else: if index and not args.wiki: print 'API not available. 
Trying with index.php only.' @@ -1921,6 +2154,7 @@ def getParameters(params=[]): 'cookies': args.cookies or '', 'delay': args.delay, 'retries': int(args.retries), + 'apiexport': args.apiexport } other = { From 120f54d1132589b9b65c0f3a44631b19a1960433 Mon Sep 17 00:00:00 2001 From: GERZAC1002 Date: Mon, 11 Apr 2022 23:22:44 +0200 Subject: [PATCH 2/3] Improvements over previous commit: -renamed --apiexport to '--apiquery' and gave it a more useful help text -renamed function 'getXMLPage_' to 'selectXMLQuerMode' to make it distinct enough from 'getXMLPage' -added in checks so that '--apiquery' can only be used together with '--curonly' -dialed back some timeouts from 120 to 60 which is still more than the previous 30 Improvements that could still be implemented: -Checking the availability of Special:Export before trying to download the list of titles and images wasting resources --- dumpgenerator.py | 63 +++++++++++++++++++++++++++--------------------- 1 file changed, 36 insertions(+), 27 deletions(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index a2d5fabc..c22a30e9 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -74,7 +74,7 @@ UTF8Writer = getwriter('utf8') sys.stdout = UTF8Writer(sys.stdout) -__VERSION__ = '0.5.0-alpha' # major, minor, micro: semver.org +__VERSION__ = '0.4.1-alpha' # major, minor, micro: semver.org class PageMissingError(Exception): def __init__(self, title, xml): @@ -175,7 +175,7 @@ def getNamespacesScraper(config={}, session=None): namespacenames = {0: ''} # main is 0, no prefix if namespaces: r = session.post( - url=config['index'], params={'title': 'Special:Allpages'}, timeout=120) + url=config['index'], params={'title': 'Special:Allpages'}, timeout=60) raw = r.text delay(config=config, session=session) @@ -217,7 +217,7 @@ def getNamespacesAPI(config={}, session=None): 'meta': 'siteinfo', 'siprop': 'namespaces', 'format': 'json'}, - timeout=120 + timeout=60 ) result = getJSON(r) delay(config=config, session=session) @@ -292,7 +292,7 @@ def getPageTitlesScraper(config={}, session=None): print ' Retrieving titles in the namespace', namespace url = '%s?title=Special:Allpages&namespace=%s' % ( config['index'], namespace) - r = session.get(url=url, timeout=120) + r = session.get(url=url, timeout=60) raw = r.text raw = cleanHTML(raw) @@ -466,7 +466,7 @@ def getXMLHeader(config={}, session=None): else: try: - xml = "".join([x for x in getXMLPage_(config=config, title=randomtitle, verbose=False, session=session)]) + xml = "".join([x for x in selectXMLQueryMode(config=config, title=randomtitle, verbose=False, session=session)]) except PageMissingError as pme: # The does not exist. Not a problem, if we get the . 
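The core of the full-history path kept from the first patch is reconstructRevisions(), which maps the attributes of each <rev> element returned by prop=revisions onto the export schema. A stripped-down, standalone version of that mapping is sketched below; the sample <rev> input is invented for illustration and only the contributor branch of the hidden/deleted handling is shown.

# Standalone sketch of the attribute-to-element mapping performed by
# reconstructRevisions(); the sample <rev> element is invented for illustration.
import xml.etree.ElementTree as ET

rev = ET.fromstring(
    '<rev revid="42" timestamp="2022-04-11T08:55:32Z" user="Example" '
    'userid="7" comment="sample edit" size="11">Hello world</rev>')

revision = ET.Element('revision')
ET.SubElement(revision, 'id').text = rev.attrib['revid']
ET.SubElement(revision, 'timestamp').text = rev.attrib['timestamp']
contributor = ET.SubElement(revision, 'contributor')
if 'userhidden' not in rev.attrib:
    ET.SubElement(contributor, 'username').text = rev.attrib['user']
    ET.SubElement(contributor, 'id').text = rev.attrib['userid']
else:
    # suppressed contributors are marked the same way Special:Export marks them
    contributor.set('deleted', 'deleted')
# the API does not always return model/format, so the patch hard-codes them
ET.SubElement(revision, 'model').text = 'wikitext'
ET.SubElement(revision, 'format').text = 'text/x-wiki'
text = ET.SubElement(revision, 'text')
text.set('xml:space', 'preserve')
text.set('bytes', rev.attrib['size'])
text.text = rev.text

print(ET.tostring(revision))

getXMLPageWithApi then pretty-prints the rebuilt tree with xml.dom.minidom and strips the wrapper lines so the indentation is comparable to what Special:Export would have produced.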
xml = pme.xml @@ -484,11 +484,11 @@ def getXMLHeader(config={}, session=None): 'meta': 'siteinfo', 'siprop': 'namespaces', 'format': 'json'}, - timeout=120 + timeout=60 ) config['export'] = json.loads(r.text)['query']['namespaces']['-1']['*'] \ + ':Export' - xml = "".join([x for x in getXMLPage_(config=config, title=randomtitle, verbose=False, session=session)]) + xml = "".join([x for x in selectXMLQueryMode(config=config, title=randomtitle, verbose=False, session=session)]) except PageMissingError as pme: xml = pme.xml except ExportAbortedError: @@ -511,7 +511,7 @@ def getXMLHeader(config={}, session=None): def getXMLFileDesc(config={}, title='', session=None): """ Get XML for image description page """ config['curonly'] = 1 # tricky to get only the most recent desc - return("".join([x for x in getXMLPage_( config=config, title=title, verbose=False, session=session)])) + return("".join([x for x in selectXMLQueryMode( config=config, title=title, verbose=False, session=session)])) def getUserAgent(): @@ -593,7 +593,7 @@ def getXMLPageCoreWithApi(headers={}, params={}, config={}, session=None): if c > 0 and c < maxretries: wait = increment * c < maxseconds and increment * \ c or maxseconds # incremental until maxseconds - print ' In attempt %d, XML for "%s" is wrong. Waiting %d seconds and reloading...'%(c, params['titles' if config['apiexport'] else 'pages'], wait) + print ' In attempt %d, XML for "%s" is wrong. Waiting %d seconds and reloading...'%(c, params['titles' if config['apiquery'] else 'pages'], wait) time.sleep(wait) # reducing server load requesting smallest chunks (if curonly then # rvlimit = 1 from mother function) @@ -601,7 +601,7 @@ def getXMLPageCoreWithApi(headers={}, params={}, config={}, session=None): params['rvlimit'] = params['rvlimit'] / 2 # half if c >= maxretries: print ' We have retried %d times' % (c) - print ' MediaWiki error for "%s", network error or whatever...' % (params['titles' if config['apiexport'] else 'pages']) + print ' MediaWiki error for "%s", network error or whatever...' % (params['titles' if config['apiquery'] else 'pages']) # If it's not already what we tried: our last chance, preserve only the last revision... # config['curonly'] means that the whole dump is configured to save only the last, # params['curonly'] should mean that we've already tried this @@ -612,7 +612,7 @@ def getXMLPageCoreWithApi(headers={}, params={}, config={}, session=None): logerror( config=config, text=u'Error while retrieving the last revision of "%s". Skipping.' 
% - (params['titles' if config['apiexport'] else 'pages']).decode('utf-8')) + (params['titles' if config['apiquery'] else 'pages']).decode('utf-8')) #raise ExportAbortedError(config['index']) return '' # empty xml @@ -660,7 +660,7 @@ def getXMLPageWithApi(config={}, title='', verbose=True, session=None): lastcontinue = params['rvcontinue'] except: lastcontinue = None - + xml = getXMLPageCoreWithApi(params=params, config=config, session=session) if xml == "": #just return so that we can continue, and getXMLPageCoreWithApi will log the error @@ -807,7 +807,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None): xml = '' except requests.exceptions.ReadTimeout as e: print ' Read timeout: %s'%(str(e[0])) - xml = '' + xml = '' c += 1 return xml @@ -914,11 +914,12 @@ def getXMLPage(config={}, title='', verbose=True, session=None): print ' %s, 1 edit' % (title.strip()) else: print ' %s, %d edits' % (title.strip(), numberofedits) -def getXMLPage_(config={}, title='', verbose=True, session=None): - #print config - if config['apiexport']: +def selectXMLQueryMode(config={}, title='', verbose=True, session=None): + if config['apiquery']: + #Using api.php?Query instead of relying on Special:Export return getXMLPageWithApi(config=config, title=title, verbose=verbose, session=session) else: + #Using the traditional method(default) return getXMLPage(config=config, title=title, verbose=verbose, session=session) return '' @@ -1001,7 +1002,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None): if c % 10 == 0: print 'Downloaded %d pages' % (c) try: - for xml in getXMLPage_(config=config, title=title, session=session): + for xml in selectXMLQueryMode(config=config, title=title, session=session): xml = cleanXML(xml=xml) xmlfile.write(xml.encode('utf-8')) except PageMissingError: @@ -1122,7 +1123,7 @@ def getXMLRevisions(config={}, session=None, allpages=False, start=None): # repeated header is confusing and would not even be valid xml = exportrequest['query']['export']['*'] yield makeXmlPageFromRaw(xml) - + if 'continue' in arvrequest: # Get the new ones arvparams['arvcontinue'] = arvrequest['continue']['arvcontinue'] @@ -1144,7 +1145,7 @@ def getXMLRevisions(config={}, session=None, allpages=False, start=None): else: # End of continuation. We are done with this namespace. break - + except (KeyError, mwclient.errors.InvalidResponse) as e: print(e) # TODO: check whether the KeyError was really for a missing arv API @@ -1376,7 +1377,7 @@ def reverse_readline(filename, buf_size=8192, truncate=False): if segment is not None: # if the previous chunk starts right from the beginning of line # do not concat the segment to the last line of new chunk - # instead, yield the segment first + # instead, yield the segment first if buffer[-1] is not '\n': lines[-1] += segment else: @@ -1938,6 +1939,10 @@ def getParameters(params=[]): help="generates a full history XML dump (--xml --curonly for current revisions only)") groupDownload.add_argument('--curonly', action='store_true', help='store only the current version of pages') + groupDownload.add_argument( + '--apiquery', + action='store_true', + help="EXPERIMENTAL: Using api.php?query instead of Special:Export to export pages, only use with --curonly") groupDownload.add_argument('--xmlrevisions', action='store_true', help='download all revisions from an API generator. 
MediaWiki 1.27+ only.') groupDownload.add_argument( @@ -1950,10 +1955,6 @@ def getParameters(params=[]): '--exnamespaces', metavar="1,2,3", help='comma-separated value of namespaces to exclude') - groupDownload.add_argument( - '--apiexport', - action='store_true', - help="Using API instead of Special:Export to export pages") # Meta info params groupMeta = parser.add_argument_group( @@ -1991,6 +1992,14 @@ def getParameters(params=[]): print getWikiEngine(url=args.wiki) sys.exit() + if (args.apiquery and not args.curonly) or (args.xmlrevisions and args.apiquery): + if (args.xmlrevisions): + print('ERROR: --apiquery conflicts with --xmlrevisions and requires --curonly') + sys.exit() + elif (args.xml): + print('ERROR: --apiquery conflicts requires --curonly') + sys.exit() + # Create session cj = cookielib.MozillaCookieJar() if args.cookies: @@ -2154,7 +2163,7 @@ def getParameters(params=[]): 'cookies': args.cookies or '', 'delay': args.delay, 'retries': int(args.retries), - 'apiexport': args.apiexport + 'apiquery': args.apiquery, } other = { @@ -2615,7 +2624,7 @@ def getWikiEngine(url=''): session.headers.update({'User-Agent': getUserAgent()}) r = session.post(url=url, timeout=30) if r.status_code == 405 or r.text == '': - r = session.get(url=url, timeout=120) + r = session.get(url=url, timeout=60) result = r.text wikiengine = 'Unknown' @@ -2698,7 +2707,7 @@ def mwGetAPIAndIndex(url=''): index = '' session = requests.Session() session.headers.update({'User-Agent': getUserAgent()}) - r = session.post(url=url, timeout=120) + r = session.post(url=url, timeout=60) result = r.text # API From 3b022354c77d1424d4258efab11c0827ee09e8f4 Mon Sep 17 00:00:00 2001 From: GERZAC1002 Date: Tue, 12 Apr 2022 00:37:39 +0200 Subject: [PATCH 3/3] Adjusted dumpgenerator.py to allow '--apiquery' to go along with '--images' and '--xmlrevisions' too, accidentally disabled that in previous commit --- dumpgenerator.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index c22a30e9..1ba83a52 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -1942,7 +1942,7 @@ def getParameters(params=[]): groupDownload.add_argument( '--apiquery', action='store_true', - help="EXPERIMENTAL: Using api.php?query instead of Special:Export to export pages, only use with --curonly") + help="EXPERIMENTAL: Using api.php?query instead of Special:Export to export pages, works with: --curonly,--xmlrevisions,--images") groupDownload.add_argument('--xmlrevisions', action='store_true', help='download all revisions from an API generator. MediaWiki 1.27+ only.') groupDownload.add_argument( @@ -1992,13 +1992,9 @@ def getParameters(params=[]): print getWikiEngine(url=args.wiki) sys.exit() - if (args.apiquery and not args.curonly) or (args.xmlrevisions and args.apiquery): - if (args.xmlrevisions): - print('ERROR: --apiquery conflicts with --xmlrevisions and requires --curonly') - sys.exit() - elif (args.xml): - print('ERROR: --apiquery conflicts requires --curonly') - sys.exit() + if (args.apiquery and not args.curonly) and (args.apiquery and not args.xmlrevisions) and (args.apiquery and not args.images): + print('ERROR: --apiquery requires either --curonly or --images or --xmlrevisions') + sys.exit() # Create session cj = cookielib.MozillaCookieJar()
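The combined condition added in this third patch is equivalent to a single flattened test, which may read more clearly. A minimal, self-contained sketch follows; the parsed flags are simulated with an example invocation rather than taken from the patch's real parser.

# (A and not X) and (A and not Y) and (A and not Z)  is equivalent to  A and not (X or Y or Z)
import argparse

parser = argparse.ArgumentParser()
for flag in ('--apiquery', '--curonly', '--xmlrevisions', '--images'):
    parser.add_argument(flag, action='store_true')
args = parser.parse_args(['--apiquery'])  # example: --apiquery without a companion flag

if args.apiquery and not (args.curonly or args.xmlrevisions or args.images):
    print('ERROR: --apiquery requires either --curonly or --images or --xmlrevisions')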