Talk:AllPages: Difference between revisions
Jump to navigation
Jump to search
(Update python script) |
No edit summary |
||
Line 1: | Line 1: | ||
== List generation == | |||
<pre> | |||
#!/usr/bin/env python3 | #!/usr/bin/env python3 | ||
Line 136: | Line 138: | ||
if __name__ == "__main__": | if __name__ == "__main__": | ||
main() | main() | ||
</pre> | |||
== Discussion == | == Discussion == |
Revision as of 10:56, 17 September 2016
List generation
#!/usr/bin/env python3 import pathlib import urllib.request import json if not pathlib.Path('output').exists(): pathlib.Path('output').mkdir() all_pages_output_file = open('output/AllPages.mediawiki', 'w') category_output_files = {} base_url = 'http://heroes.thelazy.net/wiki/' page_titles_by_category = {} category_titles_by_category = {} def main(): base_page_query = 'http://heroes.thelazy.net/wiki/api.php?action=query&list=allpages&aplimit=500&apfilterredir=nonredirects&format=json&apfrom=' continue_title = '' page_titles = {} while True: request = urllib.request.urlopen(base_page_query + continue_title.replace(' ', '_')) response = request.read() results = json.loads(response.decode()) for page in results['query']['allpages']: title = page['title'] page_titles[title] = base_url + title if 'query-continue' in results: continue_title = results['query-continue']['allpages']['apcontinue'] else: break page_title_queries = [] page_title_query = '' for title, url in page_titles.items(): if title != 'Main Page' and title != 'Search': page_title_query += title + '|' if len(page_title_query) > 50: page_title_queries.append(page_title_query.rstrip('|')) page_title_query = '' prop_page_query = 'http://heroes.thelazy.net/wiki/api.php?action=query&prop=categories&format=json&titles=' for page_title_query in page_title_queries: request = urllib.request.urlopen(prop_page_query + page_title_query.replace(' ', '_')) response = request.read() results = json.loads(response.decode()) for _, result in results['query']['pages'].items(): if 'categories' not in result: print('\n\n') print(result['title'] + ' has no categories.') print('\n\n') continue progress = result['title'] + ' : ' for category in result['categories']: if category['title'] not in page_titles_by_category: page_titles_by_category[category['title']] = set() page_titles_by_category[category['title']].add(result['title']) progress += category['title'] + ', ' print(progress.rstrip(', ')) base_category_query = 
'http://heroes.thelazy.net/wiki/api.php?action=query&list=allcategories&aclimit=500&acprop=size&format=json&acfrom=' continue_category = '' category_titles = {} while True: request = urllib.request.urlopen(base_category_query + continue_category.replace(' ', '_')) response = request.read() results = json.loads(response.decode()) for category in results['query']['allcategories']: if category['size'] is 0 or category['size'] is '0': continue title = category['*'] category_titles[title] = base_url + title if 'query-continue' in results: continue_category = results['query-continue']['allcategories']['accontinue'] else: break category_title_queries = [] category_title_query = '' for title, url in category_titles.items(): category_title_query += 'Category:' + title + '|' if len(category_title_query) > 50: category_title_queries.append(category_title_query.rstrip('|')) category_title_query = '' prop_category_query = 'http://heroes.thelazy.net/wiki/api.php?action=query&prop=categories&format=json&titles=' for category_title_query in category_title_queries: request = urllib.request.urlopen(prop_category_query + category_title_query.replace(' ', '_')) response = request.read() results = json.loads(response.decode()) for _, result in results['query']['pages'].items(): if 'categories' not in result: print('\n\n') print(result['title'] + ' has no categories.') print('\n\n') continue progress = result['title'] + ' : ' for category in result['categories']: if category['title'] not in category_titles_by_category: category_titles_by_category[category['title']] = set() category_titles_by_category[category['title']].add(result['title']) progress += category['title'] + ', ' print(progress.rstrip(', ')) print('\n\n') print(page_titles_by_category) print('\n\n') print(category_titles_by_category) for category_title in sorted(category_titles_by_category['Category:Content']): category_file_name = ''.join([i for i in category_title if i.isalpha()]) category_output_files[category_file_name] 
= open('output/' + category_file_name + '.mediawiki', 'w') print_titles(category_file_name, category_title) def print_titles(category_file_name, category_title, category_level='=', indent_level=''): all_pages_output_file.write(category_level + ' <span class="plainlinks">[' + base_url + category_title.replace(' ', '_') + ' ' + indent_level + category_title.split('Category:', 1).pop() + ']</span> ' + category_level + '\n\n') category_output_files[category_file_name].write(category_level + ' <span class="plainlinks">[' + base_url + category_title.replace(' ', '_') + ' ' + indent_level + category_title.split('Category:', 1).pop() + ']</span> ' + category_level + '\n\n') category_level += '=' indent_level += ' ' if category_title in page_titles_by_category: for page_title in sorted(page_titles_by_category[category_title]): all_pages_output_file.write('<span class="plainlinks">[' + base_url + page_title.replace(' ', '_') + ' ' + indent_level + page_title + ']</span>\n\n') category_output_files[category_file_name].write('<span class="plainlinks">[' + base_url + page_title.replace(' ', '_') + ' ' + indent_level + page_title + ']</span>\n\n') if category_title in category_titles_by_category: for child_category_title in sorted(category_titles_by_category[category_title]): print_titles(category_file_name, child_category_title, category_level, indent_level) if __name__ == "__main__": main()
Discussion
Nice technical page, but I must ask: is there any use for it? At least to me it seems quite useless, more of a hindrance than a help. –Kapteeni Ruoska (talk) 06:11, 7 September 2016 (CEST)
- I wanted to make sure there wasn't anything I was missing. When I'm browsing through the list, it's easier to click these links than to copy and paste the auto-generated titles from the API query.
- --imahero 03:14, 8 September 2016 (CEST)
- Sure, just wondering, as the wiki already has Special:AllPages, but perhaps there is a use for that. –Kapteeni Ruoska (talk) 07:27, 8 September 2016 (CEST)
- Special:AllPages felt too clumsy for me to navigate. All I really wanted was a single list of all non-redirect links :)
- --imahero 08:31, 8 September 2016 (CEST)
I may want to go back to using the external link syntax if breaking the orphaned pages is a big deal. For now I've just added the orphaned pages here and I'll just update the list every time I generate the AllPages list.
--imahero 15:19, 8 September 2016 (CEST)
- I changed it back to use the external link syntax to make sure the orphaned pages list still works properly.
- --imahero 22:48, 8 September 2016 (CEST)
- I changed it back to use the external link syntax to make sure the orphaned pages list still works properly.
It's now sorted by category.
--imahero 12:26, 10 September 2016 (CEST)