Extracting data from documents in R
There are several ways to extract data from a document in R, depending on the source: a static HTML page (rvest), a page that requires browser interaction such as logging in (RSelenium), or a plain text file on disk (base R).
Example: scraping a static HTML page with rvest
library(rvest)

# Download and parse the page once, then reuse the parsed document
# for every extraction below.
url <- "https://www.example.com"
html <- read_html(url)

# Extract the href attribute of every link (<a>) in the document.
# html_elements() supersedes html_nodes() in rvest >= 1.0.
links <- html %>%
  html_elements("a") %>%
  html_attr("href")

# Extract the text content of every paragraph (<p>) in the document.
paras <- html %>%
  html_elements("p") %>%
  html_text()
Example: driving a real browser with RSelenium to scrape content behind a login
library(RSelenium)

# Start a Chrome session. rsDriver() returns both the Selenium server
# and a client handle; keep the full object so both can be shut down later.
driver <- rsDriver(browser = "chrome")
remote_driver <- driver[["client"]]
remote_driver$navigate("https://www.example.com")

# Follow the "Login" link.
login_link <- remote_driver$findElement(using = "xpath", "//a[text()='Login']")
login_link$clickElement()

# Fill in and submit the login form.
# NOTE(security): credentials are hard-coded for illustration only;
# in real code read them from the environment, e.g. Sys.getenv("APP_USER").
username <- remote_driver$findElement(using = "id", "username")
password <- remote_driver$findElement(using = "id", "password")
username$sendKeysToElement(list("my_username"))
password$sendKeysToElement(list("my_password"))
submit <- remote_driver$findElement(using = "xpath", "//button[@type='submit']")
submit$clickElement()

# Extract data from the logged-in page. getElementText() returns a
# one-element list, so take [[1]] to get a plain character string.
data_node <- remote_driver$findElement(using = "xpath", "//div[@class='data']")
text <- data_node$getElementText()[[1]]

# Shut down the browser and the Selenium server so no orphaned
# processes are left running after the script finishes.
remote_driver$close()
driver[["server"]]$stop()
Example: reading and parsing a local text file with base R
# Read the file as a character vector, one element per line.
text <- readLines("my_file.txt")

# Trim leading/trailing whitespace first so the substitution below does
# not create empty fields at the line edges (scan() would read those as
# NA), then collapse each internal run of whitespace into a single comma.
data <- gsub("\\s+", ",", trimws(text))

# Parse the comma-separated tokens as numerics (scan()'s default `what`);
# quiet = TRUE suppresses the "Read n items" message.
data <- scan(text = data, sep = ",", quiet = TRUE)