# Install htmltab only when it is not already available, so that re-running
# the script does not reinstall the package every time.
if (!requireNamespace("htmltab", quietly = TRUE)) {
  install.packages("htmltab")
}
library(htmltab)
# On the virtual desktop, check first whether htmltab is already installed.
# To inspect a page's HTML, right-click the page in the browser and choose
# "View page source".
# Some commands may be difficult to follow unless you understand XML;
# see https://www.w3schools.com/xml
# Wikipedia article that holds the population tables.
url <- "http://en.wikipedia.org/wiki/World_population"
# XPath: find each caption whose text starts with "World historical" and climb
# to the <table> element that contains it (two tables carry such a caption).
xp <- "//caption[starts-with(text(),'World historical')]/ancestor::table"
# htmltab() assembles a data frame from the selected HTML table.
df <- htmltab(doc = url, which = xp)
# Show the result, then its class and its dimensions.
df
class(df)
dim(df)
# The entries in df contain commas.
# popFun takes one table-cell node, extracts its text with XML::xmlValue()
# and returns that text with every comma removed.
popFun <- function(node) {
  gsub(pattern = ",", replacement = "", x = XML::xmlValue(node))
}
# Scrape the same table again, this time passing popFun as bodyFun so that the
# commas are stripped from the entries.
df <- htmltab(doc = url, which = xp, bodyFun = popFun)
df
# The next table lacks header information, so the column names are supplied
# through colNames, and header = 0 indicates that no header is present.
doc <- "http://en.wikipedia.org/wiki/FC_Bayern_Munich"
# This table has no caption either, but the second entry of its first row is
# "Head coach". The XPath finds that <td> and climbs to the enclosing <table>
# (use "view page source" in the browser to check the markup).
xp2 <- "//td[text() = 'Head coach']/ancestor::table"
htmltab(
  doc = doc,
  which = xp2,
  header = 0,
  encoding = "UTF-8",
  colNames = c("name", "role")
)
# htmltab recognizes column spans and produces a one-dimensional vector of
# variable information; it also automatically removes superscript information,
# since this is usually not of use.
doc <- "http://en.wikipedia.org/wiki/Usage_share_of_web_browsers"
# Positional XPath selector for the table to scrape.
# NOTE(review): a position-based selector depends on the page's current
# layout -- confirm that it still points at the intended table.
xp3 <- "//table[7]"
# bFun takes one table-cell node, reads its text with XML::xmlValue() and
# removes a percent sign found at the very end of that text.
bFun <- function(node) {
  cell_text <- XML::xmlValue(node)
  gsub(pattern = "%$", replacement = "", x = cell_text)
}
# Scrape the browser usage table selected by xp3, cleaning the cells with bFun.
htmltab(doc = doc, which = xp3, bodyFun = bFun)
# Arjen Robben article: table selected with which = 3, header given as 1:2.
htmltab(
  doc = "https://en.wikipedia.org/wiki/Arjen_Robben",
  which = 3,
  header = 1:2
)
# When header information appears throughout the body, you can specify its
# position in the header formula.
# NOTE(review): `url` was last assigned the World population page near the top
# of this script, while the XPath below looks for a table with
# id 'team_gamelogs' -- presumably this call was meant for a different page;
# confirm and assign the intended URL before running it.
htmltab(url, which = "//table[@id='team_gamelogs']", header = . + "//td[./strong]")
# regular expressions?