json - Getting table in R from HTMLInternalDocument object -


I have to download several tables from a website; the table id is "tabela". I tried various functions (XML::readHTMLTable, XML::xmlTreeParse); the rvest package loads the page:

 # Load the page with rvest and inspect what kind of object comes back.
 # Lines starting with `##` are the console output quoted in the question.
 library(rvest)

 url <- "http://www.pse.pl/index.php?modul=21&id_rap=2&data=2013-01-01"
 # `html()` is kept as written: the class output quoted below is what this
 # reader returned for the asker.
 wpkd <- html(url)

 class(wpkd)
 ## [1] "HTMLInternalDocument" "HTMLInternalDocument" "XMLInternalDocument"  "XMLAbstractDocument"

 str(wpkd)
 ## Classes 'HTMLInternalDocument', 'HTMLInternalDocument', 'XMLInternalDocument', 'XMLAbstractDocument' <externalptr>

Now I want to extract the table with the id "tabela", or save wpkd as plain text and try a low-level extraction. However, the structure of wpkd isn't recognised:

# Attempt to print the document structure; `wpkd` is the document parsed in
# the previous snippet. Lines starting with `##` are the quoted console output,
# ending with the error the question is about.
wpkd %>% xml_structure()
## {dtd}
## <html>
##   <head>
##     <title> {text}
##     <meta [http-equiv, content]>
##     <meta [name, content]>
##     <meta [name, content]>
##     <link [rel, type, title, href]>
##     <meta [name, content]>
##     <meta [name, content]>
##     <meta [http-equiv, content]>
##     <meta [http-equiv, content]>
##     <link [rel, type, href]>
##     <link [rel, href, type]>
##     <link [rel, type, href]>
##     <link [rel, href, type, media]>
##     <link [rel, href, type, media]>
##     <link [rel, href, type, media]>
##     <script [src]>
##     <script [src]>
## Error: Unknown input XMLInternalCommentNode/XMLInternalNode/XMLAbstractNode

Given that the "header" is not uniform (spanned <tr>s), here is one way to do it (it's not the only way):

library(rvest)
library(magrittr)
library(dplyr)

# NOTE(review): `html()` is the rvest reader this snippet was written against;
# later rvest releases renamed it to `read_html()` -- switch to that if
# `html()` is not available in the installed version.
pg <- html("http://www.pse.pl/index.php?modul=21&id_rap=2&data=2013-01-01")

# Small function to extract one column of the table with id "tabela".
#
# doc: parsed HTML document to search.
# i:   1-based position of the <td> within each row (a single number).
#
# Returns a character vector holding the text of the i-th <td> of every
# matching row, with the leading one or two matches removed.
get_col <- function(doc, i) {
  # The last column (8) is "wonky": only its first match is dropped, while
  # every other column drops its first two matches. `i` is a scalar, so a
  # plain `if`/`else` is used instead of the vectorised `ifelse()`.
  skip <- if (i == 8) -1 else -2
  doc %>%
    html_nodes(xpath = sprintf("//table[@id='tabela']/tr/td[%d]", i)) %>%
    extract(seq(-1, skip)) %>% # negative indices: skip the useless "tr"s
    html_text()
}

# Manually build the data frame, which gives better column names.
# `stringsAsFactors = FALSE` keeps every column as character.
energy <- data.frame(
  time = pg %>% get_col(1),
  demand = pg %>% get_col(2),
  capacity_jwcd = pg %>% get_col(3),
  capacity_njwcd = pg %>% get_col(4),
  generation_jwcd = pg %>% get_col(5),
  generation_njwcd = pg %>% get_col(6),
  reserve_over = pg %>% get_col(7),
  reserve_below = pg %>% get_col(8),
  stringsAsFactors = FALSE
)

glimpse(energy)

## observations: 24
## variables:
## $ time             (chr) "1", "2", "3", "4", "5", "6", "7", "8", "9", "10...
## $ demand           (chr) "14 650", "14 000", "13 325", "12 850", "12 575"...
## $ capacity_jwcd    (chr) "21 032", "21 032", "21 032", "21 032", "21 032"...
## $ capacity_njwcd   (chr) "8 918", "8 918", "8 918", "8 918", "8 918", "8 ...
## $ generation_jwcd  (chr) "7 085", "6 446", "5 777", "5 307", "5 031", "4 ...
## $ generation_njwcd (chr) "7 565", "7 554", "7 548", "7 543", "7 544", "7 ...
## $ reserve_over     (chr) "1 328", "1 269", "1 209", "1 166", "1 141", "1 ...
## $ reserve_below    (chr) "-1 328", "-1 269", "-1 209", "-1 166", "-1 141"...

You'll need to do the type conversions on your own, though (and you would have had to even if you had used one of the auto-table functions, provided they worked).


Comments

Popular posts from this blog

node.js - Using Node without global install -

How to access a php class file from PHPFox framework into javascript code written in simple HTML file? -

java - Null response to php query in android, even though php works properly -