-
Notifications
You must be signed in to change notification settings - Fork 0
/
harvest_template.py
161 lines (143 loc) · 6.24 KB
/
harvest_template.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Usage:
python harvest_template.py -lang:nl -template:"Taxobox straalvinnige" orde P70 familie P71 geslacht P74
This will work on all pages that transclude the template in the article namespace.
You can also use any of the usual page generators to provide the list of pages:
python harvest_template.py -lang:nl -cat:Sisoridae -template:"Taxobox straalvinnige" -namespace:0 orde P70 familie P71 geslacht P74
"""
#
# (C) 2013 Multichill, Amir
# (C) 2013 Pywikipediabot team
#
# Distributed under the terms of MIT License.
#
__version__ = '$Id$'
#
import re
import wikipedia as pywikibot
import pagegenerators
class HarvestRobot:
    """
    A bot to add Wikidata claims harvested from template parameters.

    For every page yielded by the generator the bot looks for uses of
    ``templateTitle``. For each template field that is a key of ``fields``
    it takes the first wiki link in the field value and adds a claim for
    the mapped property pointing at the data item of the linked page.
    """

    # Language code -> item ID used as the second element of the source
    # tuple returned by setSource().
    # NOTE(review): presumably the Wikidata items of the corresponding
    # Wikipedia editions -- confirm.
    # TODO: Should be moved to a central wikidata library
    _SOURCE_VALUES = {
        'en': 'Q328',
        'sv': 'Q169514',
        'de': 'Q48183',
        'it': 'Q11920',
        'no': 'Q191769',
        'fa': 'Q48952',
        'ar': 'Q199700',
        'es': 'Q8449',
        'pl': 'Q1551807',
        'ca': 'Q199693',
        'fr': 'Q8447',
        'nl': 'Q10000',
        'pt': 'Q11921',
        'ru': 'Q206855',
        'vi': 'Q200180',
        'be': 'Q877583',
        'uk': 'Q199698',
        'tr': 'Q58255',
    }

    # Matches a [[target]] or [[target|label]] wiki link; the target is
    # captured in the named group ``title``. Compiled once instead of once
    # per template field.
    _LINK_RE = re.compile(r'\[\[(?P<title>[^\]|[#<>{}]*)(\|.*?)?\]\]')

    def __init__(self, generator, templateTitle, fields):
        """
        Arguments:
            * generator     - A generator that yields Page objects.
            * templateTitle - The template to work on
            * fields        - A dictionary of fields that are of use to us
                              (template field name -> property to claim)
        """
        self.generator = generator
        # Underscores and spaces are treated as equivalent in the title.
        self.templateTitle = templateTitle.replace(u'_', u' ')
        self.pregen = pagegenerators.PreloadingGenerator(generator)
        self.fields = fields
        self.site = pywikibot.getSite()
        self.repo = self.site.data_repository()

    def setSource(self, lang):
        """
        Get the source reference for a language.

        Arguments:
            * lang - language code to look up

        Returns a ``('143', item_id)`` tuple when the language is known,
        otherwise None.
        NOTE(review): '143' is presumably the number of the property used
        for the reference -- confirm against DataPage.editclaim.
        """
        item_id = self._SOURCE_VALUES.get(lang)
        if item_id is None:
            return None
        return ('143', item_id)

    def run(self):
        """
        Starts the robot: processes every page of the preloading generator.
        """
        for page in self.pregen:
            self.procesPage(page)

    def procesPage(self, page):
        """
        Process a single page.

        Pages without a data item are reported and skipped. Otherwise every
        use of the configured template is scanned and, for each field we
        are interested in whose property is not yet present in the item's
        claims, a new claim is added.
        """
        item = pywikibot.DataPage(page)
        pywikibot.output('Processing %s' % page)
        if not item.exists():
            pywikibot.output('%s doesn\'t have a wikidata item :(' % page)
            #TODO FIXME: We should provide an option to create the page
            return

        pagetext = pywikibot.removeDisabledParts(page.get())
        templates = pywikibot.extract_templates_and_params(pagetext)
        for (template, fielddict) in templates:
            # Only the template we were asked to harvest is of interest
            if template.replace(u'_', u' ') != self.templateTitle:
                continue
            for field, value in fielddict.items():
                # Skip fields that are not mapped to a property
                if field not in self.fields:
                    continue
                claim = self.fields[field]
                # Check if the property isn't already set. The claims are
                # looked up on every field on purpose, exactly as before.
                #TODO FIXME: This is a very crude way of dupe checking
                if claim in item.get().get('claims'):
                    pywikibot.output(u'A claim for %s already exists. Skipping' % (claim,))
                    continue
                self._addClaimFromValue(item, claim, value)

    def _addClaimFromValue(self, item, claim, value):
        """
        Add ``claim`` to ``item`` using the first wiki link in ``value``.

        Does nothing when ``value`` contains no wiki link. Redirects are
        followed before the target's data item is looked up. A missing
        page is reported and skipped.
        """
        match = self._LINK_RE.search(value)
        if not match:
            return
        link = match.group('title')
        try:
            linkedPage = pywikibot.Page(self.site, link)
            if linkedPage.isRedirectPage():
                linkedPage = linkedPage.getRedirectTarget()
            linkedItem = pywikibot.DataPage(linkedPage)
            pywikibot.output('Adding %s --> %s' % (claim, linkedItem.getID()))
            # self.site is an object, not a callable: ask it directly.
            source = self.setSource(self.site.language())
            if source:
                item.editclaim(str(claim), linkedItem.getID(), refs={source})
            else:
                item.editclaim(str(claim), linkedItem.getID())
        except pywikibot.NoPage:
            # Report the link text: linkedItem may not have been assigned
            # yet when the exception is raised.
            pywikibot.output('[[%s]] doesn\'t exist so I can\'t link to it' % (link,))
def main():
    """
    Parse the command line, build the page generator and run the bot.

    Recognised arguments:
        * -template:<title>  - the template to harvest (asked for
                               interactively when given without a value)
        * generator options  - anything accepted by
                               pagegenerators.GeneratorFactory
        * remaining words    - read as pairs: template field, then the
                               property to claim for it

    Raises ValueError when no template is given or when the remaining
    arguments do not form complete pairs.
    """
    genFactory = pagegenerators.GeneratorFactory()
    commandline_arguments = []
    templateTitle = u''
    for arg in pywikibot.handleArgs():
        if arg.startswith('-template'):
            if len(arg) == len('-template'):
                templateTitle = pywikibot.input(
                    u'Please enter the template to work on:')
            else:
                # Drop the '-template' prefix and the separator after it
                templateTitle = arg[10:]
        elif genFactory.handleArg(arg):
            continue
        else:
            commandline_arguments.append(arg)

    if not templateTitle:
        raise ValueError('No template given, use -template:"Template name"')
    if len(commandline_arguments) % 2:
        raise ValueError('Fields and properties must be given in pairs, '
                         'e.g.: orde P70 familie P71')

    # Even positions are template fields, odd positions their properties
    fields = dict(zip(commandline_arguments[0::2],
                      commandline_arguments[1::2]))

    # Prefer the generator requested on the command line. Only fall back
    # to every page transcluding the template when none was requested.
    gen = genFactory.getCombinedGenerator()
    if not gen:
        gen = pagegenerators.ReferringPageGenerator(
            pywikibot.Page(pywikibot.getSite(),
                           "Template:%s" % templateTitle),
            onlyTemplateInclusion=True)

    bot = HarvestRobot(gen, templateTitle, fields)
    bot.run()
# Run the bot only when executed as a script, not when imported.
if __name__ == "__main__":
    main()