Scraping Tropicos data in R

Just a little web scraper to mine species distribution data from the Tropicos botanical database.

The script follows the first result returned by the name search, or, if the search resolves to a single name, uses that name directly, and returns a data frame with all available distribution data. If no distribution data are found, the function returns an NA value.

#' Scrape species distribution data from Tropicos
#'
#' Runs a Tropicos name search for `species`. If the search resolves to a
#' single name page, that page is used; otherwise the first name listed in
#' the search results is followed. The distribution table of the selected
#' name is returned. Requires the rvest package.
#'
#' @param species A single character string: "genus species" or "genus".
#' @param quiet If `FALSE`, the result is also printed before it is
#'   returned; if `TRUE` (default) nothing is printed.
#' @return A data frame holding the distribution table, or `NA` when no
#'   matching name or no distribution data could be found.
#' @keywords Tropicos, species distribution
#' @export
#' @examples
#' \dontrun{
#' # with defaults, outputting a data frame with species distribution
#' # for Clematis (returns NA if no data are present)
#' df <- tropicos.species.distribution()
#' }

tropicos.species.distribution <- function(species='Clematis',quiet=TRUE){

  # fail early and clearly when the scraping package is missing
  # (require() would only return FALSE and fail later on)
  if (!requireNamespace("rvest", quietly = TRUE)) {
    stop("Package 'rvest' is required but is not installed.", call. = FALSE)
  }

  if (!is.character(species) || length(species) != 1L || is.na(species)) {
    stop("`species` must be a single character string.", call. = FALSE)
  }

  name_base <- "http://www.tropicos.org/Name/"
  table_id <-
    "#ctl00_MainContentPlaceHolder_nameDistributionsControl_gvwResults"

  # download the distribution tab of a name page and return its table
  # as a data frame (empty data frame if the table is absent)
  fetch_distribution <- function(name_url) {
    page <- rvest::read_html(paste0(name_url, "?tab=distribution"))
    data.frame(rvest::html_table(rvest::html_nodes(page, table_id)))
  }

  # build the search url, spaces become "+" in the query string
  query <- gsub(" ", "+", species, fixed = TRUE)
  search_page <- rvest::read_html(
    paste0("http://www.tropicos.org/NameSearch.aspx?name=", query)
  )

  # a search with a single hit is redirected to the name page itself;
  # in that case the citation footer contains the url of that name page
  citation <- rvest::html_text(
    rvest::html_nodes(search_page, "#ctl00_footerControl_citationDiv")
  )
  name_url <- regmatches(citation, regexpr("(http://).*[0-9]", citation))

  if (length(name_url) == 0) {
    # multiple search results: pick the link of the first listed name
    first_cell <- rvest::html_nodes(
      search_page,
      "#ctl00_MainContentPlaceHolder_nameSearchControl_gridView tr:nth-child(2) td:nth-child(3)"
    )
    href <- rvest::html_attr(rvest::html_nodes(first_cell, "a"), "href")

    # grab the numeric name identifier that follows "/Name/"
    # (a "[/Name/]" character class would strip single letters instead)
    name_id <- regmatches(
      href,
      regexpr("(?<=/Name/)[0-9]+", href, perl = TRUE)
    )
    if (length(name_id) > 0) {
      name_url <- paste0(name_base, name_id[1])
    }
  }

  # NA unless a name page was found and it lists distribution data
  distribution <- NA
  if (length(name_url) > 0) {
    distribution <- fetch_distribution(name_url[1])
    if (prod(dim(distribution)) == 0) {
      distribution <- NA
    }
  }

  # return the data frame, verbose or not
  if (quiet == FALSE) {
    print(distribution)
  }
  return(distribution)
}

 


© 2018. All rights reserved.

Powered by Hydejack v7.5.1