Talk:AllPages
== List generation ==
<pre>
#!/usr/bin/env python3

import pathlib
import urllib.request
import json

if not pathlib.Path('output').exists():
    pathlib.Path('output').mkdir()

all_pages_output_file = open('output/AllPages.mediawiki', 'w')
category_output_files = {}

base_url = 'http://heroes.thelazy.net/wiki/'
page_titles_by_category = {}
category_titles_by_category = {}


def main():
    # Collect every non-redirect page title, following the API's
    # query-continue pagination 500 titles at a time.
    base_page_query = 'http://heroes.thelazy.net/wiki/api.php?action=query&list=allpages&aplimit=500&apfilterredir=nonredirects&format=json&apfrom='
    continue_title = ''
    page_titles = {}

    while True:
        request = urllib.request.urlopen(base_page_query + continue_title.replace(' ', '_'))
        response = request.read()
        results = json.loads(response.decode())
        for page in results['query']['allpages']:
            title = page['title']
            page_titles[title] = base_url + title
        if 'query-continue' in results:
            continue_title = results['query-continue']['allpages']['apcontinue']
        else:
            break

    # Batch the titles into '|'-separated strings for the titles= parameter.
    page_title_queries = []
    page_title_query = ''

    for title, url in page_titles.items():
        if title != 'Main Page' and title != 'Search':
            page_title_query += title + '|'
            if len(page_title_query) > 50:
                page_title_queries.append(page_title_query.rstrip('|'))
                page_title_query = ''
    if page_title_query:
        # Flush the final partial batch so trailing titles are not dropped.
        page_title_queries.append(page_title_query.rstrip('|'))

    # Look up the categories of each page.
    prop_page_query = 'http://heroes.thelazy.net/wiki/api.php?action=query&prop=categories&format=json&titles='

    for page_title_query in page_title_queries:
        request = urllib.request.urlopen(prop_page_query + page_title_query.replace(' ', '_'))
        response = request.read()
        results = json.loads(response.decode())
        for _, result in results['query']['pages'].items():
            if 'categories' not in result:
                print('\n\n')
                print(result['title'] + ' has no categories.')
                print('\n\n')
                continue
            progress = result['title'] + ' : '
            for category in result['categories']:
                if category['title'] not in page_titles_by_category:
                    page_titles_by_category[category['title']] = set()
                page_titles_by_category[category['title']].add(result['title'])
                progress += category['title'] + ', '
            print(progress.rstrip(', '))

    # Collect every non-empty category title, again following pagination.
    base_category_query = 'http://heroes.thelazy.net/wiki/api.php?action=query&list=allcategories&aclimit=500&acprop=size&format=json&acfrom='
    continue_category = ''
    category_titles = {}

    while True:
        request = urllib.request.urlopen(base_category_query + continue_category.replace(' ', '_'))
        response = request.read()
        results = json.loads(response.decode())
        for category in results['query']['allcategories']:
            if category['size'] == 0 or category['size'] == '0':
                continue
            title = category['*']
            category_titles[title] = base_url + title
        if 'query-continue' in results:
            continue_category = results['query-continue']['allcategories']['accontinue']
        else:
            break

    category_title_queries = []
    category_title_query = ''

    for title, url in category_titles.items():
        category_title_query += 'Category:' + title + '|'
        if len(category_title_query) > 50:
            category_title_queries.append(category_title_query.rstrip('|'))
            category_title_query = ''
    if category_title_query:
        # Flush the final partial batch here as well.
        category_title_queries.append(category_title_query.rstrip('|'))

    # Look up the parent categories of each category.
    prop_category_query = 'http://heroes.thelazy.net/wiki/api.php?action=query&prop=categories&format=json&titles='

    for category_title_query in category_title_queries:
        request = urllib.request.urlopen(prop_category_query + category_title_query.replace(' ', '_'))
        response = request.read()
        results = json.loads(response.decode())
        for _, result in results['query']['pages'].items():
            if 'categories' not in result:
                print('\n\n')
                print(result['title'] + ' has no categories.')
                print('\n\n')
                continue
            progress = result['title'] + ' : '
            for category in result['categories']:
                if category['title'] not in category_titles_by_category:
                    category_titles_by_category[category['title']] = set()
                category_titles_by_category[category['title']].add(result['title'])
                progress += category['title'] + ', '
            print(progress.rstrip(', '))

    print('\n\n')
    print(page_titles_by_category)
    print('\n\n')
    print(category_titles_by_category)

    # Write one output file per top-level category under Category:Content,
    # plus the combined AllPages file.
    for category_title in sorted(category_titles_by_category['Category:Content']):
        category_file_name = ''.join([i for i in category_title if i.isalpha()])
        category_output_files[category_file_name] = open('output/' + category_file_name + '.mediawiki', 'w')
        print_titles(category_file_name, category_title)


def print_titles(category_file_name, category_title, category_level='=', indent_level=''):
    # Write the category as a heading, then its pages, then recurse into
    # its child categories one heading level deeper.
    all_pages_output_file.write(category_level + ' <span class="plainlinks">[' + base_url + category_title.replace(' ', '_') + ' ' + indent_level + category_title.split('Category:', 1).pop() + ']</span> ' + category_level + '\n\n')
    category_output_files[category_file_name].write(category_level + ' <span class="plainlinks">[' + base_url + category_title.replace(' ', '_') + ' ' + indent_level + category_title.split('Category:', 1).pop() + ']</span> ' + category_level + '\n\n')
    category_level += '='
    indent_level += '&nbsp;&nbsp;&nbsp;&nbsp;'
    if category_title in page_titles_by_category:
        for page_title in sorted(page_titles_by_category[category_title]):
            all_pages_output_file.write('<span class="plainlinks">[' + base_url + page_title.replace(' ', '_') + ' ' + indent_level + page_title + ']</span>\n\n')
            category_output_files[category_file_name].write('<span class="plainlinks">[' + base_url + page_title.replace(' ', '_') + ' ' + indent_level + page_title + ']</span>\n\n')
    if category_title in category_titles_by_category:
        for child_category_title in sorted(category_titles_by_category[category_title]):
            print_titles(category_file_name, child_category_title, category_level, indent_level)


if __name__ == "__main__":
    main()
</pre>
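
The paging logic above is repeated for allpages and allcategories; it can be factored into one generator. A minimal sketch, assuming the same api.php endpoint and query-continue response shape the script already relies on; the helper name iter_allpages is illustrative, not part of the script:

<pre>
import json
import urllib.request

API = 'http://heroes.thelazy.net/wiki/api.php'

def iter_allpages(api=API):
    """Yield every non-redirect page title, following query-continue."""
    apfrom = ''
    while True:
        url = (api + '?action=query&list=allpages&aplimit=500'
               '&apfilterredir=nonredirects&format=json&apfrom='
               + apfrom.replace(' ', '_'))
        results = json.loads(urllib.request.urlopen(url).read().decode())
        for page in results['query']['allpages']:
            yield page['title']
        if 'query-continue' not in results:
            break
        # apcontinue is the title to resume from on the next request.
        apfrom = results['query-continue']['allpages']['apcontinue']

# Example: print every title, as the script's first loop collects them.
for title in iter_allpages():
    print(title)
</pre>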


== Discussion ==


Nice technical page, but I must ask: is there any use for it? At least for me this seems quite useless, hindering rather than helping. –Kapteeni Ruoska (talk) 06:11, 7 September 2016 (CEST)

I wanted to make sure there wasn't anything I was missing. When I'm browsing through the list it's easier to click these links than to copy-paste the auto-generated titles from the API query.
--imahero 03:14, 8 September 2016 (CEST)
Sure, just wondering, as the wiki already has Special:AllPages, but perhaps there is a use for that. –Kapteeni Ruoska (talk) 07:27, 8 September 2016 (CEST)
Special:AllPages felt too clumsy for me to navigate. All I really wanted was a single list of all nonredirect links :)
--imahero 08:31, 8 September 2016 (CEST)

I may want to go back to using the external link syntax if breaking the orphaned pages list is a big deal. For now I've just added the orphaned pages here, and I'll update the list every time I generate the AllPages list.
--imahero 15:19, 8 September 2016 (CEST)

I changed it back to use the external link syntax to make sure the orphaned pages list still works properly.
--imahero 22:48, 8 September 2016 (CEST)
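
For reference, the difference between the two link styles, sketched in Python with a placeholder title ('Angel' is illustrative, not taken from the script):

<pre>
base_url = 'http://heroes.thelazy.net/wiki/'
title = 'Angel'  # placeholder title for illustration

# Internal wikilink: registers as an incoming link, so every page the
# generated list mentions would vanish from Special:LonelyPages
# (the orphaned pages list).
internal = '[[' + title + ']]'

# External link syntax: wrapped in class="plainlinks" it renders much
# the same, but it is not counted as a wikilink, so the orphaned pages
# list keeps working.
external = '[' + base_url + title.replace(' ', '_') + ' ' + title + ']'

print(internal)  # [[Angel]]
print(external)  # [http://heroes.thelazy.net/wiki/Angel Angel]
</pre>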

It's now sorted by category.
--imahero 12:26, 10 September 2016 (CEST)