# htmltab walkthrough: pull HTML tables from web pages into data frames.
#
# To find the table you want, open the page in a browser, right-click and
# choose "View page source". The XPath expressions below are easier to
# follow with some XML background: https://www.w3schools.com/xml

# Install htmltab only when it is missing, instead of reinstalling it on
# every run of the script.
if (!requireNamespace("htmltab", quietly = TRUE)) {
  install.packages("htmltab")
}
library(htmltab)

# World population ----

url <- "http://en.wikipedia.org/wiki/World_population"

# Two tables on the page have a caption starting with "World historical";
# select by caption text and walk up to the enclosing <table>.
xp <- "//caption[starts-with(text(),'World historical')]/ancestor::table"

# htmltab assembles a data frame from an HTML table.
df <- htmltab(doc = url, which = xp)
df
class(df)
dim(df)

# The entries in df contain commas (thousands separators).
# popFun takes a table-cell node and returns its text with commas removed.
popFun <- function(node) {
  x <- XML::xmlValue(node)
  gsub(",", "", x)
}

# Re-read the table, cleaning every body cell with popFun.
df <- htmltab(doc = url, which = xp, bodyFun = popFun)
df

# FC Bayern Munich ----

# This table lacks header information, so we provide column names through
# colNames. We also need to set header = 0 to indicate that no header is
# present.
doc <- "http://en.wikipedia.org/wiki/FC_Bayern_Munich"

# In this example there is no caption, but the second entry of the first
# row is "Head coach", held in a <td> inside the table (see page source).
xp2 <- "//td[text() = 'Head coach']/ancestor::table"

htmltab(
  doc = doc,
  which = xp2,
  header = 0,
  encoding = "UTF-8",
  colNames = c("name", "role")
)

# Usage share of web browsers ----

# htmltab recognizes column spans and produces a one-dimensional vector of
# variable information; it also automatically removes superscript
# information, since this is usually not of use.
doc <- "http://en.wikipedia.org/wiki/Usage_share_of_web_browsers"

# Selects the table by position rather than by content, so the match
# depends on the page layout at the time the script is run.
xp3 <- "//table[7]"

# bFun takes a table-cell node and returns its text without a trailing "%".
bFun <- function(node) {
  x <- XML::xmlValue(node)
  gsub("%$", "", x)
}

htmltab(doc = doc, which = xp3, bodyFun = bFun)

# Arjen Robben ----

# `which` may also be a number; header = 1:2 presumably marks the first two
# rows as header rows (confirm against the htmltab documentation).
htmltab("https://en.wikipedia.org/wiki/Arjen_Robben", which = 3, header = 1:2)

# Header formula ----

# When header information appears throughout the body, you can specify its
# position in the header formula.
# NOTE(review): `url` still points at the World population page assigned
# above, while this XPath looks for a table with id 'team_gamelogs'.
# Confirm the intended page and set `url` accordingly before running.
htmltab(
  url,
  which = "//table[@id='team_gamelogs']",
  header = . + "//td[./strong]"
)

# Open question: regular expressions?