json - Getting table in R from HTMLInternalDocument object -
I have to download several tables from a website; the table has the id "tabela". I tried various functions, such as XML::readHTMLTable and XML::xmlTreeParse, and the rvest package, which loads the page:
require(rvest) url="http://www.pse.pl/index.php?modul=21&id_rap=2&data=2013-01-01" wpkd <- html(url) class(wpkd) [1] "HTMLInternalDocument" "HTMLInternalDocument" "XMLInternalDocument" "XMLAbstractDocument" str(wpkd) Classes 'HTMLInternalDocument', 'HTMLInternalDocument', 'XMLInternalDocument', 'XMLAbstractDocument' <externalptr> Now I would like to extract the table with the id "tabela", or save wpkd as plain text and try a low-level extraction. However, the structure of wpkd isn't recognised:
> wpkd %>% xml_structure() {dtd} <html> <head> <title> {text} <meta [http-equiv, content]> <meta [name, content]> <meta [name, content]> <link [rel, type, title, href]> <meta [name, content]> <meta [name, content]> <meta [http-equiv, content]> <meta [http-equiv, content]> <link [rel, type, href]> <link [rel, href, type]> <link [rel, type, href]> <link [rel, href, type, media]> <link [rel, href, type, media]> <link [rel, href, type, media]> <script [src]> <script [src]> error: unknown input xmlinternalcommentnode/xmlinternalnode/xmlabstractnode
Given that the "header" is not uniform (spanned `tr`s), here is one way to do it (it's not the only way):
library(rvest)
library(magrittr)
library(dplyr)

# Fetch and parse the page once; every column below is pulled from this
# document. read_html() replaces html(), which was deprecated in rvest 0.3.0.
pg <- read_html("http://www.pse.pl/index.php?modul=21&id_rap=2&data=2013-01-01")

# Small function to extract one column of the table with id "tabela".
#
# doc: parsed HTML document to search.
# i:   1-based position of the <td> within each table row.
#
# Returns the text of the matched cells as a character vector, after
# dropping the leading cells that come from the non-uniform header rows.
get_col <- function(doc, i) {
  # `-1:skip` is a negative index: -1:-2 drops the first two matched cells,
  # -1:-1 drops only the first. The last column (8) is "wonky" and has only
  # one leading cell to skip.
  skip <- if (i == 8) -1 else -2
  doc %>%
    html_nodes(xpath = sprintf("//table[@id='tabela']/tr/td[%d]", i)) %>%
    extract(-1:skip) %>% # skip the useless header "tr"s
    html_text()
}

# Manually build the data frame, which gives better column names.
energy <- data.frame(
  time = get_col(pg, 1),
  demand = get_col(pg, 2),
  capacity_jwcd = get_col(pg, 3),
  capacity_njwcd = get_col(pg, 4),
  generation_jwcd = get_col(pg, 5),
  generation_njwcd = get_col(pg, 6),
  reserve_over = get_col(pg, 7),
  reserve_below = get_col(pg, 8),
  stringsAsFactors = FALSE
)

glimpse(energy)
## observations: 24
## variables:
## $ time (chr) "1", "2", "3", "4", "5", "6", "7", "8", "9", "10...
## $ demand (chr) "14 650", "14 000", "13 325", "12 850", "12 575"...
## $ capacity_jwcd (chr) "21 032", "21 032", "21 032", "21 032", "21 032"...
## $ capacity_njwcd (chr) "8 918", "8 918", "8 918", "8 918", "8 918", "8 ...
## $ generation_jwcd (chr) "7 085", "6 446", "5 777", "5 307", "5 031", "4 ...
## $ generation_njwcd (chr) "7 565", "7 554", "7 548", "7 543", "7 544", "7 ...
## $ reserve_over (chr) "1 328", "1 269", "1 209", "1 166", "1 141", "1 ...
## $ reserve_below (chr) "-1 328", "-1 269", "-1 209", "-1 166", "-1 141"...

# You'll need to do the type conversions on your own, though (and you would
# have to even if you used one of the auto-table functions, provided they
# worked).
Comments
Post a Comment