Talk:AllPages: Difference between revisions
Jump to navigation
Jump to search
No edit summary |
Update list again |
||
| (19 intermediate revisions by 2 users not shown) | |||
| Line 1: | Line 1: | ||
== List generation == | |||
<pre> | |||
#!/usr/bin/env python3 | |||
< | |||
import pathlib | |||
import urllib.request | |||
import json | |||
::I wanted to make sure there wasn't anything I was missing. When I'm browsing through the list it's easier to click these links than copy pasting the auto-generated titles from the api query. | |||
if not pathlib.Path('output').exists(): | |||
pathlib.Path('output').mkdir() | |||
all_pages_output_file = open('output/AllPages.mediawiki', 'w') | |||
all_pages_output_file.write('[[Category: Indices]]\n\n') | |||
category_output_files = {} | |||
base_url = 'http://heroes.thelazy.net/wiki/' | |||
page_titles_by_category = {} | |||
category_titles_by_category = {} | |||
def main(): | |||
base_page_query = 'http://heroes.thelazy.net/wiki/api.php?action=query&list=allpages&aplimit=500&apfilterredir=nonredirects&format=json&apfrom=' | |||
continue_title = '' | |||
page_titles = {} | |||
while True: | |||
request = urllib.request.urlopen(base_page_query + continue_title.replace(' ', '_')) | |||
response = request.read() | |||
results = json.loads(response.decode()) | |||
for page in results['query']['allpages']: | |||
title = page['title'] | |||
page_titles[title] = base_url + title | |||
if 'query-continue' in results: | |||
continue_title = results['query-continue']['allpages']['apcontinue'] | |||
else: | |||
break | |||
page_title_queries = [] | |||
page_title_query = '' | |||
for title, url in page_titles.items(): | |||
if title != 'Main Page' and title != 'Search': | |||
page_title_query += title + '|' | |||
if len(page_title_query) > 50: | |||
page_title_queries.append(page_title_query.rstrip('|')) | |||
page_title_query = '' | |||
prop_page_query = 'http://heroes.thelazy.net/wiki/api.php?action=query&prop=categories&format=json&titles=' | |||
for page_title_query in page_title_queries: | |||
request = urllib.request.urlopen(prop_page_query + page_title_query.replace(' ', '_')) | |||
response = request.read() | |||
results = json.loads(response.decode()) | |||
for _, result in results['query']['pages'].items(): | |||
if 'categories' not in result: | |||
print('\n\n') | |||
print(result['title'] + ' has no categories.') | |||
print('\n\n') | |||
continue | |||
progress = result['title'] + ' : ' | |||
for category in result['categories']: | |||
if category['title'] not in page_titles_by_category: | |||
page_titles_by_category[category['title']] = set() | |||
page_titles_by_category[category['title']].add(result['title']) | |||
progress += category['title'] + ', ' | |||
print(progress.rstrip(', ')) | |||
base_category_query = 'http://heroes.thelazy.net/wiki/api.php?action=query&list=allcategories&aclimit=500&acprop=size&format=json&acfrom=' | |||
continue_category = '' | |||
category_titles = {} | |||
while True: | |||
request = urllib.request.urlopen(base_category_query + continue_category.replace(' ', '_')) | |||
response = request.read() | |||
results = json.loads(response.decode()) | |||
for category in results['query']['allcategories']: | |||
if category['size'] is 0 or category['size'] is '0': | |||
continue | |||
title = category['*'] | |||
category_titles[title] = base_url + title | |||
if 'query-continue' in results: | |||
continue_category = results['query-continue']['allcategories']['accontinue'] | |||
else: | |||
break | |||
category_title_queries = [] | |||
category_title_query = '' | |||
for title, url in category_titles.items(): | |||
category_title_query += 'Category:' + title + '|' | |||
if len(category_title_query) > 50: | |||
category_title_queries.append(category_title_query.rstrip('|')) | |||
category_title_query = '' | |||
prop_category_query = 'http://heroes.thelazy.net/wiki/api.php?action=query&prop=categories&format=json&titles=' | |||
for category_title_query in category_title_queries: | |||
request = urllib.request.urlopen(prop_category_query + category_title_query.replace(' ', '_')) | |||
response = request.read() | |||
results = json.loads(response.decode()) | |||
for _, result in results['query']['pages'].items(): | |||
if 'categories' not in result: | |||
print('\n\n') | |||
print(result['title'] + ' has no categories.') | |||
print('\n\n') | |||
continue | |||
progress = result['title'] + ' : ' | |||
for category in result['categories']: | |||
if category['title'] not in category_titles_by_category: | |||
category_titles_by_category[category['title']] = set() | |||
category_titles_by_category[category['title']].add(result['title']) | |||
progress += category['title'] + ', ' | |||
print(progress.rstrip(', ')) | |||
print('\n\n') | |||
print(page_titles_by_category) | |||
print('\n\n') | |||
print(category_titles_by_category) | |||
for category_title in sorted(category_titles_by_category['Category:Content']): | |||
category_file_name = ''.join([i for i in category_title if i.isalpha()]) | |||
category_output_files[category_file_name] = open('output/' + category_file_name + '.mediawiki', 'w') | |||
category_output_files[category_file_name].write('[[Category: Indices]]\n\n') | |||
print_titles(category_file_name, category_title) | |||
def print_titles(category_file_name, category_title, category_level='=', indent_level=''): | |||
all_pages_output_file.write(category_level + ' <span class="plainlinks">[' + base_url + category_title.replace(' ', '_') + ' ' + indent_level + category_title.split('Category:', 1).pop() + ']</span> ' + category_level + '\n\n') | |||
category_output_files[category_file_name].write(category_level + ' <span class="plainlinks">[' + base_url + category_title.replace(' ', '_') + ' ' + indent_level + category_title.split('Category:', 1).pop() + ']</span> ' + category_level + '\n\n') | |||
category_level += '=' | |||
indent_level += ' ' | |||
if category_title in page_titles_by_category: | |||
for page_title in sorted(page_titles_by_category[category_title]): | |||
all_pages_output_file.write('<span class="plainlinks">[' + base_url + page_title.replace(' ', '_') + ' ' + indent_level + page_title + ']</span>\n\n') | |||
category_output_files[category_file_name].write('<span class="plainlinks">[' + base_url + page_title.replace(' ', '_') + ' ' + indent_level + page_title + ']</span>\n\n') | |||
if category_title in category_titles_by_category: | |||
for child_category_title in sorted(category_titles_by_category[category_title]): | |||
print_titles(category_file_name, child_category_title, category_level, indent_level) | |||
if __name__ == "__main__": | |||
main() | |||
</pre> | |||
== Discussion == | |||
Nice technical page, but I must ask is there any use for it? At least for me this seems quite useless, causing hinder rather than help. –[[User:Kapteeni Ruoska|Kapteeni Ruoska]] ([[User talk:Kapteeni Ruoska|talk]]) 06:11, 7 September 2016 (CEST) | |||
:I wanted to make sure there wasn't anything I was missing. When I'm browsing through the list it's easier to click these links than copy pasting the auto-generated titles from the api query. | |||
:--[[User:imahero|imahero]] 03:14, 8 September 2016 (CEST) | |||
::Sure, just wondering, as the wiki already has [[Special:AllPages]], but perhaps there is a use for that. –[[User:Kapteeni Ruoska|Kapteeni Ruoska]] ([[User talk:Kapteeni Ruoska|talk]]) 07:27, 8 September 2016 (CEST) | |||
:::[[Special:AllPages]] felt too clumsy for me to navigate. All I really wanted was a single list of all nonredirect links :) | |||
::: --[[User:imahero|imahero]] 08:31, 8 September 2016 (CEST) | |||
I may want to go back to using the external link syntax if breaking the [[Special:LonelyPages|orphaned pages]] is a big deal. For now I've just added the orphaned pages here and I'll just update the list every time I generate the AllPages list.<br> | |||
--[[User:imahero|imahero]] 15:19, 8 September 2016 (CEST) | |||
::I changed it back to use the external link syntax to make sure the [[Special:LonelyPages|orphaned pages]] list still works properly. <br> | |||
:: --[[User:imahero|imahero]] 22:48, 8 September 2016 (CEST) | |||
It's now sorted by category. <br> | |||
--[[User:imahero|imahero]] 12:26, 10 September 2016 (CEST) | |||
Revision as of 11:15, 17 September 2016
List generation
#!/usr/bin/env python3
import pathlib
import urllib.request
import json
# Ensure the output directory exists. mkdir(exist_ok=True) replaces the
# original exists()-then-mkdir() pair, which was racy (TOCTOU) and noisier.
pathlib.Path('output').mkdir(exist_ok=True)

# Master index file that will list every page; one header line marks it as
# an index page on the wiki. Kept open for the whole run (closed at exit).
all_pages_output_file = open('output/AllPages.mediawiki', 'w')
all_pages_output_file.write('[[Category: Indices]]\n\n')

# Per-top-level-category output files, keyed by sanitized file name;
# populated lazily by main().
category_output_files = {}

base_url = 'http://heroes.thelazy.net/wiki/'

# category title -> set of member page titles / member sub-category titles.
page_titles_by_category = {}
category_titles_by_category = {}
def _fetch_json(url):
    """GET *url* and return the decoded JSON body.

    Uses a context manager so the HTTP response is always closed (the
    original leaked every connection until garbage collection).
    """
    with urllib.request.urlopen(url) as response:
        return json.loads(response.read().decode())


def _batch_queries(titles, prefix=''):
    """Join *titles* into '|'-separated query strings of roughly 50+ chars.

    Mirrors the original batching (a batch is emitted as soon as it grows
    past 50 characters) but also flushes the final partial batch — the
    original silently dropped any titles left over after the loop.
    """
    queries = []
    query = ''
    for title in titles:
        query += prefix + title + '|'
        if len(query) > 50:
            queries.append(query.rstrip('|'))
            query = ''
    if query:  # BUG FIX: trailing partial batch was previously discarded
        queries.append(query.rstrip('|'))
    return queries


def _group_by_category(title_queries, grouping):
    """Query the categories of each batched title list and fill *grouping*.

    *grouping* maps category title -> set of member titles. Titles with no
    categories are reported on stdout and skipped.
    """
    prop_query = 'http://heroes.thelazy.net/wiki/api.php?action=query&prop=categories&format=json&titles='
    for title_query in title_queries:
        results = _fetch_json(prop_query + title_query.replace(' ', '_'))
        for result in results['query']['pages'].values():
            if 'categories' not in result:
                print('\n\n')
                print(result['title'] + ' has no categories.')
                print('\n\n')
                continue
            progress = result['title'] + ' : '
            for category in result['categories']:
                grouping.setdefault(category['title'], set()).add(result['title'])
                progress += category['title'] + ', '
            print(progress.rstrip(', '))


def main():
    """Crawl the wiki API and emit category-sorted index files.

    Fetches every non-redirect page and every non-empty category, groups
    them by parent category into the module-level dicts, then writes one
    '.mediawiki' index per child of Category:Content via print_titles().
    """
    # --- collect all non-redirect page titles (paginated API walk) ------
    base_page_query = 'http://heroes.thelazy.net/wiki/api.php?action=query&list=allpages&aplimit=500&apfilterredir=nonredirects&format=json&apfrom='
    continue_title = ''
    page_titles = {}
    while True:
        results = _fetch_json(base_page_query + continue_title.replace(' ', '_'))
        for page in results['query']['allpages']:
            title = page['title']
            page_titles[title] = base_url + title
        if 'query-continue' in results:
            continue_title = results['query-continue']['allpages']['apcontinue']
        else:
            break

    # Main Page and Search are navigation pages, not content — exclude them.
    page_title_queries = _batch_queries(
        title for title in page_titles
        if title not in ('Main Page', 'Search'))
    _group_by_category(page_title_queries, page_titles_by_category)

    # --- collect all non-empty category titles (paginated API walk) -----
    base_category_query = 'http://heroes.thelazy.net/wiki/api.php?action=query&list=allcategories&aclimit=500&acprop=size&format=json&acfrom='
    continue_category = ''
    category_titles = {}
    while True:
        results = _fetch_json(base_category_query + continue_category.replace(' ', '_'))
        for category in results['query']['allcategories']:
            # BUG FIX: the original used `is 0 or is '0'` — identity, not
            # equality — which only "worked" via CPython small-int caching.
            if category['size'] in (0, '0'):
                continue
            title = category['*']
            category_titles[title] = base_url + title
        if 'query-continue' in results:
            continue_category = results['query-continue']['allcategories']['accontinue']
        else:
            break

    category_title_queries = _batch_queries(category_titles, prefix='Category:')
    _group_by_category(category_title_queries, category_titles_by_category)

    print('\n\n')
    print(page_titles_by_category)
    print('\n\n')
    print(category_titles_by_category)

    # One index file per direct child of Category:Content; the file name is
    # the category title stripped to letters only.
    for category_title in sorted(category_titles_by_category['Category:Content']):
        category_file_name = ''.join([i for i in category_title if i.isalpha()])
        category_output_files[category_file_name] = open('output/' + category_file_name + '.mediawiki', 'w')
        category_output_files[category_file_name].write('[[Category: Indices]]\n\n')
        print_titles(category_file_name, category_title)
def print_titles(category_file_name, category_title, category_level='=', indent_level=''):
    """Recursively write one category's heading, pages, and sub-categories.

    Every line goes to both the master AllPages index and the per-category
    file selected by *category_file_name*. Each recursion step deepens the
    heading by one '=' and the indent by one space.
    """
    targets = (all_pages_output_file, category_output_files[category_file_name])
    display_name = category_title.split('Category:', 1).pop()
    heading = (category_level + ' <span class="plainlinks">['
               + base_url + category_title.replace(' ', '_') + ' '
               + indent_level + display_name + ']</span> '
               + category_level + '\n\n')
    for target in targets:
        target.write(heading)

    child_level = category_level + '='
    child_indent = indent_level + ' '

    # Member pages first, alphabetically, at the deeper indent.
    if category_title in page_titles_by_category:
        for page_title in sorted(page_titles_by_category[category_title]):
            entry = ('<span class="plainlinks">['
                     + base_url + page_title.replace(' ', '_') + ' '
                     + child_indent + page_title + ']</span>\n\n')
            for target in targets:
                target.write(entry)

    # Then recurse into member sub-categories, alphabetically.
    if category_title in category_titles_by_category:
        for child_title in sorted(category_titles_by_category[category_title]):
            print_titles(category_file_name, child_title, child_level, child_indent)
# Run the crawl only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Discussion
Nice technical page, but I must ask is there any use for it? At least for me this seems quite useless, causing hinder rather than help. –Kapteeni Ruoska (talk) 06:11, 7 September 2016 (CEST)
- I wanted to make sure there wasn't anything I was missing. When I'm browsing through the list it's easier to click these links than copy pasting the auto-generated titles from the api query.
- --imahero 03:14, 8 September 2016 (CEST)
- Sure, just wondering, as the wiki already has Special:AllPages, but perhaps there is a use for that. –Kapteeni Ruoska (talk) 07:27, 8 September 2016 (CEST)
- Special:AllPages felt too clumsy for me to navigate. All I really wanted was a single list of all nonredirect links :)
- --imahero 08:31, 8 September 2016 (CEST)
I may want to go back to using the external link syntax if breaking the orphaned pages is a big deal. For now I've just added the orphaned pages here and I'll just update the list every time I generate the AllPages list.
--imahero 15:19, 8 September 2016 (CEST)
- I changed it back to use the external link syntax to make sure the orphaned pages list still works properly.
- --imahero 22:48, 8 September 2016 (CEST)
It's now sorted by category.
--imahero 12:26, 10 September 2016 (CEST)