wikiTools package

Functions

Functions to obtain a list of Wikidata entities

w_SearchByLabel(string, mode='inlabel', langs="", langsorder='', instanceof="", Pproperty="", debug=FALSE)

w_SearchByOccupation(Qoc, mode=c('default','count','wikipedias'), langsorder='', wikilangs='', nlimit=10000, debug=FALSE)

Functions to obtain information from a list of Wikidata entities or a single one.

w_isInstanceOf(entity_list, instanceof='', nlimit=50000, debug=FALSE)

w_Wikipedias(entity_list, wikilangs="", instanceof='', nlimit=1500, debug=FALSE)

w_isValid(entity_list, nlimit=50000, debug=FALSE)

w_Property(entity_list, Pproperty, includeQ=FALSE, langsorder=‘en’, nlimit=10000, debug=FALSE)

w_SearchByAuthority(Pauthority, langsorder='', instanceof='', nlimit=10000, debug=FALSE)

Pauthority = Authority Database Property in Wikidata

w_EntityInfo(entity_list, mode='default', langsorder='', wikilangs="", nlimit=MW_LIMIT, debug=FALSE)

Functions to obtain information using the Wikimedia APIs

m_Opensearch(string, project=‘en.wikipedia.org’, profile=“engine_autoselect”, redirects=“resolve”)

m_reqMediaWiki(titles, mode=c(‘wikidataEntity’,‘redirects’,‘pagePrimaryImage’,‘pageFiles’), project=‘en.wikipedia.org’, redirects=TRUE, exclude_ext=‘svg|webp|xcf’)

m_Pageviews(article, start, end, project=“en.wikipedia.org”, access=“all-access”, agent=“user”, granularity=“monthly”, redirects=FALSE)

m_XtoolsInfo(article, infotype=c(“articleinfo”, “prose”, “links”), project=“en.wikipedia.org”, redirects=FALSE)

Functions to obtain information (viafID or cluster records) using the VIAF API

v_AutoSuggest(author) : obtains viafID

v_Search(CQL_Query, mode=c(‘default’, ‘anyField’, ‘allmainHeadingEl’, ‘allNames’, ‘allPersonalNames’, ‘allTitle’), schema=c(‘brief’, ‘JSON’)) : obtains clusters records

Function to retrieve a cluster record using the viafID.

v_GetRecord(viafid, record_format=‘viaf.json’): retrieve a cluster record

Function to extract information from a VIAF cluster record

v_Extract(viaf, info, source=NULL)

Package installation and loading

To install and load the updated version of the wikiTools package simply run the following commands:

install.packages("wikiTools")

library(wikiTools)

Examples of Wikidata functions using WDQS

Search string “Iranzo” in different positions

Exact search in Label or exact search in AltLabel (case-sensitive and diacritic-sensitive)

Optional: limit by instanceof Wikidata class (Qxx).

Optional: return information of some properties (Pproperties, Pxxx).

df <- w_SearchByLabel(string='Iranzo', langsorder='es|en')
df <- w_SearchByLabel(string='Iranzo', langsorder='es|en', instanceof = 'Q5')
df <- w_SearchByLabel(string='Iranzo', langsorder='es|en', instanceof = 'Q5|Q101352')
df <- w_SearchByLabel(string='Iranzo', langsorder='es|en', instanceof = 'Q5',
                      Pproperty = 'P21|P569|P570')

Search at the beginning in Label or AltLabel (diacritics and case are ignored)

df <- w_SearchByLabel(string='Iranzo', lang='en', langsorder='es|en', mode='startswith')
df <- w_SearchByLabel(string='Iranzo', lang='en', langsorder='es|en', instanceof = 'Q5',
                      mode='startswith')
df <- w_SearchByLabel(string='Iranzo', lang='en', langsorder='es|en',
                      instanceof = 'Q5|Q101352', mode='startswith')
df <- w_SearchByLabel(string='Iranzo', lang='en', langsorder='en', instanceof = 'Q5',
                      Pproperty = 'P21|P569|P570', mode='startswith')

Search in any position in Label or AltLabel (diacritics and case are ignored)

If lang == '' the search is performed in any language; otherwise it is performed only in the indicated language.

df <- w_SearchByLabel(string='Iranzo', langsorder='es|en', mode='inlabel')

Search only in Chinese (Simplified) (language code: zh):

df <- w_SearchByLabel(string='Iranzo', langsorder='zh|es', lang='zh', mode='inlabel')

Optional instanceof and Property

df <- w_SearchByLabel(string='Iranzo', langsorder='es|en', instanceof = 'Q5',
                      mode='inlabel')
df <- w_SearchByLabel(string='Iranzo', langsorder='es|en', instanceof = 'Q5|Q101352',
                      mode='inlabel')
df <- w_SearchByLabel(string='Iranzo', langsorder='es|en', instanceof = 'Q5',
                      Pproperty = 'P21|P569|P570', mode='inlabel')

aux: getting a vector of entities (`l`) to use later.

df <- w_SearchByLabel(string='Iranzo', langsorder='es|en', mode='inlabel')
if(length(df)){
  l <- df$entity
}

w_isInstanceOf

Check if the elements in entity_list are instances of a Wikidata class

df <- w_isInstanceOf(entity_list=l, instanceof='Q5')
# Not TRUE
if(length(df)){
  df[!df$instanceof_Q5,]
}

##                entity       instanceof instanceof_Q5
## Q6058550     Q6058550   Q16560|Q133215         FALSE
## Q11912738   Q11912738            Q3947         FALSE
## Q31835108   Q31835108        Q24529780         FALSE
## Q45976259   Q45976259          Q101352         FALSE
## Q45987474   Q45987474         Q4167410         FALSE
## Q47034606   Q47034606         Q1642895         FALSE
## Q67289998   Q67289998           Q38720         FALSE
## Q83296470   Q83296470                          FALSE
## Q85684513   Q85684513 Q28564|Q12317349         FALSE
## Q97101007   Q97101007          Q245117         FALSE
## Q97101009   Q97101009          Q245117         FALSE
## Q111015546 Q111015546             Q571         FALSE
## Q117783790 Q117783790          Q811430         FALSE
## Q125544306 Q125544306        Q47461344         FALSE
## Q125544313 Q125544313         Q3331189         FALSE
## Q131370779 Q131370779           Q79007         FALSE
## Q137732706 Q137732706         Q3393298         FALSE
## Q137774274 Q137774274         Q3393298         FALSE
## Q139797875 Q139797875                          FALSE
## Q140182784 Q140182784         Q1172284         FALSE

w_Wikipedias

Search for Wikipedia pages in all/some languages

Optional: instanceOF (limit to entities which are instance of a Wikidata class)

df <- w_Wikipedias(entity_list=l)
df <- w_Wikipedias(entity_list=l, wikilangs='es|en|fr')
df <- w_Wikipedias(entity_list=l, wikilangs='es|en|fr', instanceof="Q5")

w_SearchByOccupation

Count entities, or get the entities with that occupation, also get Wikipedia pages

Note: depending on connection speed, the nlimit parameter must be adjusted

w_SearchByOccupation(Qoc="Q2306091", mode='count') # "Q2306091" Qoc for Sociologist
q <- w_SearchByOccupation(Qoc="Q2306091")

lw <- w_SearchByOccupation(Qoc='Q2306091', mode='wikipedias') # lw=dataframe
# We can obtain the same information using previous function w_Wikipedias:
lw2 <- w_Wikipedias(entity_list=l)
# Verifying:
all(lw['Q10320558','pages'] == lw2['Q10320558','pages'])
# Verifying:
all(sort(strsplit(lw['Q9061', 'pages'], '|', fixed = T)[[1]]) ==
    sort(strsplit(lw2['Q9061', 'pages'], '|', fixed = T)[[1]]))

w_isValid

Check if the Wikidata entities are valid. An entity is valid if it has a label or a description. If an entity exists but is not valid, it is possible that it redirects to another entity; in that case, the redirection is obtained. Other entities may have existed in the past but are currently deleted.

l2 <- append(l, c("Q115637688", "Q105660123"))  # Note: adding two new entities
v <- w_isValid(l2)
if(length(v)){
  # Not valid
  v[!v$valid,]
}

##                entity valid instanceof redirection
## Q105660123 Q105660123 FALSE              Q97352588
## Q115637688 Q115637688 FALSE                       
## Q139797875 Q139797875 FALSE

w_Property

Obtain properties of entity_list.

p <- w_Property(l, Pproperty = 'P21|P569|P214', langsorder = 'es|en')

w_SearchByAuthority

Search for Wikidata entities that have an identifier in the Wikidata authority property “Pauthority”.

Optional: instanceOf

Example: Pauthority=P4439 (has identifier in the Museo Nacional Centro de Arte Reina Sofía)

mncars   <- w_SearchByAuthority(Pauthority="P4439", langsorder = 'es|en')
# 1286  [human, groups, etc.]
mncarsQ5 <- w_SearchByAuthority(Pauthority="P4439", langsorder = 'es|en',
                                     instanceof = 'Q5')  # 1280
# Entities that are not 'human' (Q5) (see the entityDescription column):
mncars   <- w_SearchByAuthority(Pauthority="P4439", langsorder = 'es|en')
if(length(mncars) && length(mncarsQ5)){
  mncars[!(mncars$entity %in% mncarsQ5$entity),]  # not instance of Q5.
}

w_EntityInfo

Get some properties of a Wikidata entity.

df <- w_EntityInfo(entity_list='Q134644', langsorder='es|en')
df <- w_EntityInfo(entity_list='Q134644', langsorder='es|en', wikilangs='es|en|fr')
df <- w_EntityInfo(c('Q270510', 'Q1675466', 'Q24871'), mode='film', langsorder='es|en', wikilangs='es|en|fr')
# Search string 'van Beethoven' inlabel
w <- w_SearchByLabel('van Beethoven', mode='inlabel', langsorder = '', instanceof = 'Q5')
if(length(w)){
  df <- w_EntityInfo(w$entity, langsorder='en', wikilangs='en|es|fr', debug='info')
}
# Search 3D films
w <- w_SearchByInstanceof(instanceof='Q229390', langsorder = 'en|es', debug = 'info')
if(length(w)){
  df <- w_EntityInfo(w$entity, mode="film", langsorder='en', wikilangs='en', debug='info')
}

Examples of WikiMedia functions

m_Opensearch

Search articles that contain any of the words (note: it is better to use a long string)

Some search profiles:

df <- m_Opensearch(string='Duque de Alba', project='es.wikipedia.org',
                   profile="engine_autoselect", redirects="resolve")
df <- m_Opensearch(string='Duque de Alba', project='es.wikipedia.org', profile="strict")
df <- m_Opensearch(string='Duque de Alba', project='es.wikipedia.org', profile="fuzzy")

m_reqMediaWiki

Checks if titles are in a Wikimedia project and returns the Wikidata entity for them, if they have one.

Note that URLdecode("a%CC%8C") is the letter “a” with the combining caron (ǎ)

df <- m_reqMediaWiki(c('Max Planck', URLdecode("a%CC%8C"), 'Max', 'Cervante', 'humanist'),
                        mode='wikidataEntity', project='en.wikipedia.org')

Obtains the redirections of a page (the page itself can be a redirect to other page).

Returns a vector for each title; in each vector the first element is the destination, and the rest are all the pages that redirect to it.

a <- m_reqMediaWiki(c('Cervantes', 'Planck', 'Noexiste'), mode='redirects',
                    project='es.wikipedia.org')
a

## $Cervantes
##  [1] "Miguel de Cervantes"            "Miguel de Cerbantes"           
##  [3] "Miguel de Cervantes y Saavedra" "Miguel De Cervantes y Saavedra"
##  [5] "El manco de Lepanto"            "Miguel de cervantes"           
##  [7] "Manco de Lepanto"               "Don Miguel de Cervantes"       
##  [9] "Cervantino"                     "Cervantina"                    
## [11] "Miguel de Cervantes Saavedra"   "Cervantes Saavedra, Miguel de" 
## [13] "Miguel de Cervantes y Cortinas" "Cervantesco"                   
## [15] "Cervántico"                     "Cervantes"                     
## 
## $Planck
## [1] "Max Planck"                   "Planck"                      
## [3] "Max Karl Ernst Ludwig Planck"
## 
## $Noexiste
## [1] NA

Gets the URL of the primary image of Wikimedia pages.

Gets all URL of files inserted in the pages (images, sounds, videos…), using ‘|’ as separator, and excluding some extensions in the exclude_ext parameter.

Both modes automatically resolve redirects (the destination is the “normalized” column of the data-frame returned).

i <- m_reqMediaWiki(c('Max Planck', URLdecode("a%CC%8C"), 'Max', 'Cervante', 'humanist'),
                  mode='pagePrimaryImage')

f <- m_reqMediaWiki(c('Max Planck', URLdecode("a%CC%8C"), 'Max', 'Cervante', 'humanist'),
                  mode='pageFiles', exclude_ext = "svg|webp|xcf")

m_Pageviews

Gets the visits that a page has had in a date interval

Optional: redirects

v <-  m_Pageviews(article="Miguel de Cervantes", start="20230101", end="20260228",
                   project="es.wikipedia.org", granularity="monthly")

m_XtoolsInfo

Obtains information (as vector) about an article in the Wikimedia project.

Infotype: articleinfo, prose, links

Optional: redirects

x <-  m_XtoolsInfo(article="Cervantes", infotype="articleinfo", project="es.wikipedia.org")
xx <- m_XtoolsInfo(article="Cervantes", infotype="articleinfo", project="es.wikipedia.org",
                   redirects=TRUE)

y <-  m_XtoolsInfo(article="Miguel de Cervantes", infotype="links", project="es.wikipedia.org")
yy <- m_XtoolsInfo(article="Cervantes", infotype="links", project="es.wikipedia.org",
                    redirects=TRUE)

Gets all information (articleinfo, prose, links).

z  <- m_XtoolsInfo(article="Miguel de Cervantes", infotype="all", project="es.wikipedia.org")
zz <- m_XtoolsInfo(article="Cervantes", infotype="all", project="es.wikipedia.org",
                       redirects=TRUE)

Examples using VIAF functions

v_AutoSuggest

Searches authors. Sometimes the same author appears several times, under a different name.

Return a data-frame.

Important: The API returns a maximum of 10 records.

v_AutoSuggest('Iranzo')
v_AutoSuggest('Esparza, María')
v_AutoSuggest('Escobar, Modesto')
# Note that four rows are returned, but only two different viafids.

v_Search and v_Extract

Search using CQL_Query

Search in any field (cql.any)

The operator is “=”, so the search matches all the terms and only those:

CQL_Query <- 'cql.any = "García Iranzo, Juan"'
r <- v_Search(CQL_Query)
# r contains complete VIAF records (sometimes seen as a "cluster record",
# which is unified by combining records from many libraries around the world)
v_Extract(r)

## $`Iranzo, G., 1918-1998`
## $`Iranzo, G., 1918-1998`$Name
## [1] "Iranzo, G., 1918-1998"
## 
## $`Iranzo, G., 1918-1998`$vid
## [1] "49001761"
## 
## $`Iranzo, G., 1918-1998`$gender
## [1] "male"
## 
## $`Iranzo, G., 1918-1998`$dates
## [1] "1918:1998"
## 
## $`Iranzo, G., 1918-1998`$sources
##                                       text                BNC             ISNI
## 1                    Iranzo, G., 1918-1998 981058518730306706 0000000061151209
## 2            Maravillas, Juanito 1921-1998               <NA>             <NA>
## 3            García Iranzo, Juan 1918-1998               <NA>             <NA>
## 4 Juan García Iranzo historietista español               <NA>             <NA>
## 5               Maravillas, Juanito, 1921-               <NA>             <NA>
##                  BNE        DNB      WKP           LC
## 1               <NA>       <NA>     <NA>         <NA>
## 2 981060879838108606       <NA>     <NA>         <NA>
## 3               <NA> 1280548487     <NA>         <NA>
## 4               <NA>       <NA> Q9015544         <NA>
## 5               <NA>       <NA>     <NA> no2007032440
## 
## $`Iranzo, G., 1918-1998`$titles
##  [1] "Antonio Barbas Heredia"                                                   
##  [2] "El Cachorro y los buitres del Mar Caribe"                                 
##  [3] "El Capitán coraje"                                                        
##  [4] "Cómete una tontería ; En una jaula de cristal : fandangos"                
##  [5] "Dick Relámpago : el rey de la pradera"                                    
##  [6] "La familia Pepe"                                                          
##  [7] "Historia del flamenco, v. 31 [SR] 1996:"                                  
##  [8] "El Hombre de los espacios"                                                
##  [9] "Juanito Maravillas"                                                       
## [10] "Mis cantes de oro"                                                        
## [11] "Nueva ilusión"                                                            
## [12] "Obra selecta."                                                            
## [13] "Pancho Colate"                                                            
## [14] "Piropos andaluces"                                                        
## [15] "Porque me diste esperanza : milonga ; Malditos sean los celos : fandangos"
## [16] "Pregunté a una jardinera ; Están preciosas las flores"                    
## [17] "Los reyes del cante"                                                      
## [18] "Sierra de Cazorla"                                                        
## [19] "Te deja vivir tranquilo : fandangos ; Yo no pensaba siquiera : fandangos" 
## [20] "La Verdadera historia de Almuñécar"                                       
## [21] "Y cuando llegue a Graná : granaínas ; Lo aprendí en La Carolina : taranta"
## 
## $`Iranzo, G., 1918-1998`$occupations
##         BNE
## 1 cantaores
## 
## $`Iranzo, G., 1918-1998`$coauthors
##                          coauthor
## 1   Gómez Sodi, Domingo 1904-1989
## 2  Vicente, el Granaíno 1927-2017
## 3      Cruz Rueda, Juan 1925-2001
## 4                 Macabich, Jorge
## 5        Conde Martín, Luis 1940-
## 6   Valderrama, Juanito 1916-2004
## 7    Simón, Paquito 1931-ca. 1970
## 8         Paco de Lucía 1947-2014
## 9       Murillo, Camilo 1926-2007
## 10       Martínez, Pepe 1923-1985
## 
## $`Iranzo, G., 1918-1998`$wikipedias
## [1] "https://ca.wikipedia.org/wiki/Juan_García_Iranzo"
## [2] "https://es.wikipedia.org/wiki/Juan_García_Iranzo"
## [3] "https://fr.wikipedia.org/wiki/G._Iranzo"

Shortcut

r <- v_Search("García Iranzo, Juan", mode="anyField")
v_Extract(r)

## $`Iranzo, G., 1918-1998`
## $`Iranzo, G., 1918-1998`$Name
## [1] "Iranzo, G., 1918-1998"
## 
## $`Iranzo, G., 1918-1998`$vid
## [1] "49001761"
## 
## $`Iranzo, G., 1918-1998`$gender
## [1] "male"
## 
## $`Iranzo, G., 1918-1998`$dates
## [1] "1918:1998"
## 
## $`Iranzo, G., 1918-1998`$sources
##                                       text                BNC             ISNI
## 1                    Iranzo, G., 1918-1998 981058518730306706 0000000061151209
## 2            Maravillas, Juanito 1921-1998               <NA>             <NA>
## 3            García Iranzo, Juan 1918-1998               <NA>             <NA>
## 4 Juan García Iranzo historietista español               <NA>             <NA>
## 5               Maravillas, Juanito, 1921-               <NA>             <NA>
##                  BNE        DNB      WKP           LC
## 1               <NA>       <NA>     <NA>         <NA>
## 2 981060879838108606       <NA>     <NA>         <NA>
## 3               <NA> 1280548487     <NA>         <NA>
## 4               <NA>       <NA> Q9015544         <NA>
## 5               <NA>       <NA>     <NA> no2007032440
## 
## $`Iranzo, G., 1918-1998`$titles
##  [1] "Antonio Barbas Heredia"                                                   
##  [2] "El Cachorro y los buitres del Mar Caribe"                                 
##  [3] "El Capitán coraje"                                                        
##  [4] "Cómete una tontería ; En una jaula de cristal : fandangos"                
##  [5] "Dick Relámpago : el rey de la pradera"                                    
##  [6] "La familia Pepe"                                                          
##  [7] "Historia del flamenco, v. 31 [SR] 1996:"                                  
##  [8] "El Hombre de los espacios"                                                
##  [9] "Juanito Maravillas"                                                       
## [10] "Mis cantes de oro"                                                        
## [11] "Nueva ilusión"                                                            
## [12] "Obra selecta."                                                            
## [13] "Pancho Colate"                                                            
## [14] "Piropos andaluces"                                                        
## [15] "Porque me diste esperanza : milonga ; Malditos sean los celos : fandangos"
## [16] "Pregunté a una jardinera ; Están preciosas las flores"                    
## [17] "Los reyes del cante"                                                      
## [18] "Sierra de Cazorla"                                                        
## [19] "Te deja vivir tranquilo : fandangos ; Yo no pensaba siquiera : fandangos" 
## [20] "La Verdadera historia de Almuñécar"                                       
## [21] "Y cuando llegue a Graná : granaínas ; Lo aprendí en La Carolina : taranta"
## 
## $`Iranzo, G., 1918-1998`$occupations
##         BNE
## 1 cantaores
## 
## $`Iranzo, G., 1918-1998`$coauthors
##                          coauthor
## 1   Gómez Sodi, Domingo 1904-1989
## 2  Vicente, el Granaíno 1927-2017
## 3      Cruz Rueda, Juan 1925-2001
## 4                 Macabich, Jorge
## 5        Conde Martín, Luis 1940-
## 6   Valderrama, Juanito 1916-2004
## 7    Simón, Paquito 1931-ca. 1970
## 8         Paco de Lucía 1947-2014
## 9       Murillo, Camilo 1926-2007
## 10       Martínez, Pepe 1923-1985
## 
## $`Iranzo, G., 1918-1998`$wikipedias
## [1] "https://ca.wikipedia.org/wiki/Juan_García_Iranzo"
## [2] "https://es.wikipedia.org/wiki/Juan_García_Iranzo"
## [3] "https://fr.wikipedia.org/wiki/G._Iranzo"

Search in 1xx, 4xx, 5xx fields of MARC record (local.names)

Operator is “all”: search all terms

CQL_Query <- 'local.names all "Figuerola"'
r <- v_Search(CQL_Query)