1 Star 0 Fork 0

lovelacelee / cppreference-doc

Create your Gitee Account
Explore and code with more than 6 million developers. Free private repositories! :)
Sign up
Clone or download
export.py 2.96 KB
Copy Edit Web IDE Raw Blame History
#!/usr/bin/env python3
'''
Copyright (C) 2017 Povilas Kanapickas <povilas@radix.lt>
This file is part of cppreference-doc
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see http://www.gnu.org/licenses/.
'''
import argparse
import urllib.parse
import urllib.request
import json
def retrieve_page_names(root, ns_index):
    """Return the titles of all pages in one namespace of a MediaWiki site.

    Queries ``<root>/api.php`` with ``action=query&list=allpages`` in
    batches of up to 500 titles and keeps requesting further batches for
    as long as the response carries continuation data.

    Two continuation formats are understood:

    * a top-level ``continue`` object, whose key/value pairs are all sent
      back with the next request;
    * the older ``query-continue`` -> ``allpages`` -> ``apcontinue`` value,
      which is sent back as ``apcontinue``.

    Args:
        root: Base URL of the MediaWiki installation; ``/api.php`` is
            appended to it, so it should not end with a slash.
        ns_index: Index of the namespace to list (sent as ``apnamespace``).

    Returns:
        A list of page title strings in the order the API returned them.

    Raises:
        urllib.error.URLError: If a request cannot be completed.
        KeyError: If a response does not contain ``query.allpages``.
    """
    pages = []
    # Extra query parameters carried over from the previous response.
    continuation = {}

    while True:
        params = {
            'action': 'query',
            'list': 'allpages',
            'apnamespace': ns_index,
            'aplimit': 500,
            'format': 'json'
        }
        params.update(continuation)

        url = "{0}/api.php?{1}".format(root, urllib.parse.urlencode(params))
        with urllib.request.urlopen(url) as f:
            data = json.loads(f.read().decode('utf-8'))

        pages += [p['title'] for p in data['query']['allpages']]

        # Current format: echo every key of the "continue" object back.
        if data.get('continue'):
            continuation = dict(data['continue'])
            continue

        # Legacy format: only the allpages continuation token is relevant.
        legacy = data.get('query-continue', {}).get('allpages', {})
        if 'apcontinue' in legacy:
            continuation = {'apcontinue': legacy['apcontinue']}
            continue

        return pages
def export_pages(root, pages, output_path):
    """Download an export of the given pages and save it to a file.

    Sends a POST request to ``<root>/index.php?title=Special:Export``
    with ``action=submit`` and stores the response body at
    ``output_path``.

    Args:
        root: Base URL of the MediaWiki installation; ``/index.php`` is
            appended to it.
        pages: Iterable of page title strings; they are joined with
            newlines into the ``pages`` form field.
        output_path: Path of the file the response is written to.
    """
    endpoint = "{0}/index.php?title=Special:Export&action=submit".format(root)

    # Form fields sent in the POST body.
    form_fields = {
        'wpDownload': '',
        'curonly': 1,
        'pages': '\n'.join(pages)
    }
    post_body = urllib.parse.urlencode(form_fields).encode('ascii')

    urllib.request.urlretrieve(endpoint, output_path, data=post_body)
def main():
    """Command-line entry point: export the pages of the given namespaces.

    Parses ``--url``, ``output_path`` and one or more ``ns_index`` values
    from the command line. For each namespace index it collects the page
    titles with ``retrieve_page_names`` and prints how many were found.
    The combined titles are then sorted and handed to ``export_pages``
    together with the URL and the output path.

    Exits with an argparse usage error if ``--url`` is not supplied.
    """
    parser = argparse.ArgumentParser(prog='export.py')
    # Every request URL is built from --url; without it the script would
    # format the string "None" into the URL and fail inside urllib, so the
    # option is mandatory.
    parser.add_argument('--url', type=str, required=True,
                        help='The URL to the root of the MediaWiki '
                             'installation')
    parser.add_argument('output_path', type=str,
                        help='The path to the XML file to save output to')
    parser.add_argument('ns_index', type=str, nargs='+',
                        help='The indices of the namespaces to retrieve')
    args = parser.parse_args()

    pages = []
    for ns_index in args.ns_index:
        new_pages = retrieve_page_names(args.url, ns_index)
        print("Retrieved {0} pages for namespace {1}".format(len(new_pages),
                                                             ns_index))
        pages += new_pages

    pages = sorted(pages)
    export_pages(args.url, pages, args.output_path)
# Run the exporter only when this file is executed as a script.
if __name__ == "__main__":
    main()

Comment ( 0 )

Sign in to post a comment

1
https://gitee.com/lovelacelee/cppreference-doc.git
git@gitee.com:lovelacelee/cppreference-doc.git
lovelacelee
cppreference-doc
cppreference-doc
master

Search