initial approach of parsing using FromElement

This commit is contained in:
Jonas Maier 2023-09-12 00:29:42 +02:00
parent 3102487eaf
commit 0fdc116594
19 changed files with 538 additions and 88 deletions

5
.gitignore vendored
View File

@ -1,2 +1,3 @@
/target
/Cargo.lock
polonium/target
polonium-macros/target
Cargo.lock

View File

@ -0,0 +1,14 @@
[package]
name = "polonium-macros"
version = "0.1.0"
edition = "2021"
[lib]
proc-macro = true
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
quote = "1.0"
proc-macro2 = "1.0.6"
# The key is `features` (plural); with the singular `feature` the listed syn features were never requested.
syn = { version = "2.0", features = ["full", "parsing"] }

205
polonium-macros/src/lib.rs Normal file
View File

@ -0,0 +1,205 @@
use proc_macro::TokenStream;
use proc_macro2::{Span, TokenStream as TokenStream2};
use quote::{quote, quote_spanned, ToTokens};
use syn::{parse_macro_input, spanned::Spanned, Attribute, DeriveInput, Field, Path};
/// Unwraps a `Result`, or returns the `Err` payload itself from the enclosing function.
///
/// Unlike `?`, the payload is returned as-is: it is not wrapped in `Err` and no
/// conversion is applied, so the enclosing function must return the error type directly.
macro_rules! propagate {
    ($result:expr) => {
        match $result {
            Err(early_exit) => return early_exit,
            Ok(value) => value,
        }
    };
}
// TODO: actually also support parse_inner and parse_outer
/// Derives `polonium::element::FromElement` for a struct with named fields.
///
/// Recognised attributes:
/// - `#[css(...)]` on the struct and on every field (required): the tokens are passed to
///   `polonium::locator::by::css`. The struct's value becomes `locator()`, the fields'
///   values become `children()` in declaration order.
/// - `#[text]` (default), `#[inner]`, `#[outer]`, `#[elem]`, `#[attr(...)]` on a field:
///   selects what is read from the matched element (see `get_source`).
/// - `#[parse(f)]`: the field is `f(source)`; a bare `#[parse]` uses `source.clone().into()`.
/// - `#[try_parse(f)]`: the field is `f(source)` with the error mapped into
///   `InstantiationError::Parse`; a bare `#[try_parse]` uses `source.parse()`.
/// - With neither `parse` nor `try_parse`, the field is `source.clone().into()`.
///
/// The generated `create` skips the first (root) element, panics if the number of
/// remaining elements differs from the number of fields, and reads field `i` from
/// element `i`.
///
/// Enums, unions and structs with unnamed fields are rejected with a `compile_error!`.
#[proc_macro_derive(FromElement, attributes(css, parse, try_parse, inner, outer, elem, text, attr))]
pub fn impl_from_element(tokens: TokenStream) -> TokenStream {
    let krate = quote! {polonium};

    let input = parse_macro_input!(tokens as DeriveInput);
    let input_span = input.span();

    let input_struct = match input.data {
        syn::Data::Struct(s) => s,
        syn::Data::Enum(_) | syn::Data::Union(_) => {
            return quote_spanned! {input_span =>
                compile_error!("cannot derive FromElement on Enums or Unions.");
            }
            .into();
        }
    };

    let css = propagate!(get_css(input_span, &input.attrs));

    let mut field_css = vec![];
    let mut field_creation = vec![];

    for (idx, field) in input_struct.fields.iter().enumerate() {
        let field_span = field.span();

        // Tuple-struct fields have no identifier; report that at the field instead of
        // panicking inside the macro.
        let field_name = match field.ident.as_ref() {
            Some(name) => name,
            None => {
                return quote_spanned! {field_span =>
                    compile_error!("FromElement can only be derived for structs with named fields.");
                }
                .into();
            }
        };

        let css = propagate!(get_css(field_span, &field.attrs));
        let source = propagate!(get_source(field));
        let source = source.build_expr(quote! {elements[#idx]});

        // `get_value` returns `Err` when the attribute is absent; that is not an error
        // here, it only means the next strategy is tried.
        let field_value = if let Ok(parser) = get_value(field_span, &field.attrs, "parse") {
            match parser {
                Some(parser) => quote! {#parser(#source)},
                None => quote! {#source.clone().into()},
            }
        } else if let Ok(parser) = get_value(field_span, &field.attrs, "try_parse") {
            match parser {
                Some(parser) => quote! {#parser(#source).map_err(|e| #krate::element::InstantiationError::Parse(Box::new(e)))?},
                None => quote! {#source.parse().map_err(|e| #krate::element::InstantiationError::Parse(Box::new(e)))?},
            }
        } else {
            quote! {#source.clone().into()}
        };

        let field_value = quote_spanned!(field_span => #field_value);

        field_css.push(css);
        field_creation.push(quote! {#field_name: #field_value});
    }

    let struct_name = &input.ident;
    // Carry the struct's generics over to the impl so generic structs can derive too.
    let (impl_generics, ty_generics, where_clause) = input.generics.split_for_impl();
    let field_amt = input_struct.fields.len();

    quote_spanned! {input_span =>
        #[allow(unused)]
        impl #impl_generics #krate::element::FromElement for #struct_name #ty_generics #where_clause {
            fn locator() -> #krate::locator::Locator {
                #krate::locator::by::css(#css)
            }

            fn children() -> Vec<#krate::locator::Locator> {
                vec![#(#krate::locator::by::css(#field_css)),*]
            }

            fn create(elements: #krate::element::QueryResult) -> Result<Self, #krate::element::InstantiationError> {
                // ignoring root element for now.
                let elements = &elements.elements()[1..];
                if elements.len() != #field_amt {
                    panic!("wrong amount of fields given.");
                }
                Ok(Self {#(
                    #field_creation
                ),*})
            }
        }
    }
    .into()
}
fn get_css(span: Span, attrs: &[Attribute]) -> Result<TokenStream2, TokenStream> {
Ok(get_value(span, attrs, "css")?.ok_or(quote_spanned!(span => compile_error!("CSS attribute needs a value.")))?)
}
/// Looks up the attribute named `key` in `attrs` and returns its value tokens.
///
/// - `Ok(Some(tokens))`: the value of the first `#[key = value]` attribute, or, if there
///   is none, the tokens inside the first `#[key(...)]` attribute.
/// - `Ok(None)`: an attribute with path `key` exists but carries no value.
/// - `Err(..)`: no such attribute; the error is a `compile_error!` spanned at `span`.
fn get_value(span: Span, attrs: &[Attribute], key: &str) -> Result<Option<TokenStream2>, TokenStream> {
    let has_key = |path: &Path| path.is_ident(key);

    // Name-value form takes precedence over list form, regardless of attribute order.
    let from_name_value = attrs.iter().find_map(|attr| {
        let pair = attr.meta.require_name_value().ok()?;
        has_key(&pair.path).then(|| pair.value.to_token_stream())
    });
    let from_list = || {
        attrs.iter().find_map(|attr| {
            let list = attr.meta.require_list().ok()?;
            has_key(&list.path).then(|| list.tokens.clone())
        })
    };

    match from_name_value.or_else(from_list) {
        Some(tokens) => Ok(Some(tokens)),
        None if attrs.iter().any(|attr| has_key(attr.path())) => Ok(None),
        None => {
            let error = quote_spanned! {span =>
                compile_error!(concat!("did not find attribute `", #key, "`."));
            };
            Err(error.into())
        }
    }
}
fn get_source(field: &Field) -> Result<Source, TokenStream> {
let is_source_attr = |a: &Attribute| {
a.path()
.get_ident()
.map(|i| {
let name = i.to_string();
["attr", "text", "inner", "outer", "elem"].contains(&name.as_str())
})
.unwrap_or(false)
};
let source_attrs = field
.attrs
.iter()
.filter(|a| is_source_attr(&a))
.collect::<Vec<_>>();
if source_attrs.is_empty() {
Ok(Source::Text)
} else if source_attrs.len() == 1 {
let attr = &source_attrs[0];
let attr_name = attr.path().get_ident().unwrap().to_string();
let attr_name = attr_name.as_str();
let span = attr.span();
match attr_name {
"text" => Ok(Source::Text),
"inner" => Ok(Source::InnerHtml),
"outer" => Ok(Source::OuterHtml),
"elem" => Ok(Source::Elem),
"attr" => {
if let Ok(attr_list) = attr.meta.require_list() {
Ok(Source::Attribute(attr_list.tokens.clone().into()))
} else {
return Err(
quote_spanned! {span => compile_error!("please specify which attribute you want to match on.");}.into(),
);
}
}
_ => unreachable!(),
}
} else {
let span2 = source_attrs[1].span();
Err(quote_spanned! {span2 =>
compile_error!("this attribute conflicts with a previous one regarding source of field.");
}.into())
}
}
/// Which part of a matched element a derived field is read from.
enum Source {
/// `#[attr(...)]`: holds the tokens written inside the parentheses; `build_expr`
/// passes them to `extract_attr`.
Attribute(TokenStream),
/// `#[text]`, or no source attribute at all: read via `.text()`.
Text,
/// `#[inner]`: read via `.inner()`.
InnerHtml,
/// `#[outer]`: read via `.outer()`.
OuterHtml,
/// `#[elem]`: the element expression itself is used unchanged.
Elem,
}
impl Source {
fn build_expr(&self, value: TokenStream2) -> TokenStream2 {
match self {
Source::Text => quote! {(#value).text()},
Source::InnerHtml => quote! {(#value).inner()},
Source::OuterHtml => quote!((#value).outer()),
Source::Elem => value,
Source::Attribute(attr) => {
let attr: TokenStream2 = attr.clone().into();
quote! {(#value).extract_attr(#attr)?}
}
}
}
}

View File

@ -14,3 +14,5 @@ serde = { version = "1.0.188", features = ["derive"] }
serde_json = "1.0.105"
tokio = { version = "1.32.0", features = ["rt-multi-thread", "macros", "time", "sync"] }
tokio-tungstenite = "0.20.0"
polonium-macros = { path = "../polonium-macros" }
lol_html = "1.1.1"

View File

@ -1,5 +1,6 @@
use crate::cooldown::{Cooldown, RandomizedIntervalCooldown};
use crate::locator::By;
use crate::element::{Element, FromElement, QueryResult};
use crate::locator::Locator;
use crate::msg::*;
use crate::{error::Result, network::Connector, Ignore};
use std::process::Child;
@ -61,14 +62,14 @@ impl Browser {
self.con.call(tx_msg).await
}
pub async fn click(&mut self, location: By) -> Result<()> {
pub async fn click(&mut self, location: Locator) -> Result<()> {
let expr = location.js_expr();
let script = include_str!("js/click.js").replace("{{locator}}", &expr);
self.msg(script).await?;
Ok(())
}
pub async fn click_link(&mut self, location: By) -> Result<()> {
pub async fn click_link(&mut self, location: Locator) -> Result<()> {
self.click(location).await?;
self.con.wait_for_connection().await;
Ok(())
@ -83,3 +84,38 @@ impl Browser {
self.child.kill().ignore();
}
}
impl Browser {
    /// Finds every occurrence of `E` on the current page.
    ///
    /// Sends `findElements([...])` to the page with `E::locator()` as the first selector
    /// followed by `E::children()`, deserializes the reply into one group of elements
    /// per root match, and passes each group to `E::create`.
    ///
    /// Groups for which `E::create` returns an error are skipped with a warning on
    /// stderr, so one malformed match does not abort the whole query.
    ///
    /// # Errors
    /// Fails if the selectors cannot be serialized, the browser call fails, or the
    /// reply cannot be deserialized.
    pub async fn find_all<E: FromElement>(&mut self) -> Result<Vec<E>> {
        // Root selector first, then the child selectors, as the page script expects.
        let locators: Vec<String> = std::iter::once(E::locator())
            .chain(E::children())
            .map(|locator| match locator {
                Locator::Css(css) => css,
            })
            .collect();

        let js_array = serde_json::to_string(&locators)?;
        let js_expr = format!("findElements({js_array})");
        let reply = self.msg(js_expr).await?;

        let groups: Vec<Vec<Element>> = serde_json::from_value(reply.result)?;

        let mut found = Vec::with_capacity(groups.len());
        for elements in groups {
            match E::create(QueryResult { elements }) {
                Ok(value) => found.push(value),
                Err(error) => {
                    eprintln!("polonium: skipping element that could not be instantiated: {error:?}");
                }
            }
        }

        Ok(found)
    }
}

94
polonium/src/element.rs Normal file
View File

@ -0,0 +1,94 @@
use std::{error::Error, fmt::Display};
use lol_html::errors::RewritingError;
use serde::Deserialize;
use crate::locator::Locator;
/// Snapshot of one DOM element, deserialized from the page script's reply.
///
/// The page script fills the fields from the element's `outerHTML`, `innerHTML`
/// and `innerText`, plus the id it assigned to the element.
#[derive(Deserialize, Clone, Debug)]
pub struct Element {
// Id assigned by the page script when the element was selected.
pub(crate) id: i32,
// The element's `outerHTML`.
outer_html: String,
// The element's `innerHTML`.
inner_html: String,
// The element's `innerText`.
inner_text: String,
}
impl Element {
    /// The element's text (`innerText` as reported by the page).
    pub fn text(&self) -> &str {
        &self.inner_text
    }

    /// The element's inner HTML.
    pub fn inner(&self) -> &str {
        &self.inner_html
    }

    /// The element's outer HTML.
    pub fn outer(&self) -> &str {
        &self.outer_html
    }

    /// Returns the value of attribute `attr` on this element.
    ///
    /// The outer HTML is streamed through an `HtmlRewriter` and only the first element
    /// it reports is inspected. That first element is this element's own start tag, so
    /// attributes of nested elements are never picked up by accident.
    ///
    /// # Errors
    /// - `InstantiationError::AttributeNotFound` if the element lacks the attribute.
    /// - `InstantiationError::Rewriting` if the HTML rewriter fails.
    pub fn extract_attr(&self, attr: &str) -> Result<String, InstantiationError> {
        use lol_html::*;

        let mut root_seen = false;
        let mut value = None;

        let mut rewriter = HtmlRewriter::new(
            Settings {
                element_content_handlers: vec![element!("*", |el| {
                    // Only the first reported element is the root; skip all descendants.
                    if !root_seen {
                        root_seen = true;
                        value = el.get_attribute(attr);
                    }
                    Ok(())
                })],
                ..Settings::default()
            },
            // The rewritten output is not needed.
            |_: &[u8]| {},
        );

        rewriter.write(self.outer_html.as_bytes())?;
        rewriter.end()?;

        value.ok_or(InstantiationError::AttributeNotFound)
    }
}
/// Elements handed to [`FromElement::create`].
///
/// The field is crate-private so that library users cannot construct a `QueryResult`
/// themselves.
pub struct QueryResult {
// Elements in the order produced by the page script.
pub(crate) elements: Vec<Element>,
}
impl QueryResult {
    /// This is not part of the public API.
    /// Do NOT use this.
    ///
    /// Borrows the wrapped elements; used by the code generated by the derive macro.
    #[doc(hidden)]
    pub fn elements(&self) -> &[Element] {
        self.elements.as_slice()
    }
}
/// Reasons why building a value from queried elements can fail.
#[derive(Debug)]
pub enum InstantiationError {
/// `Element::extract_attr` did not find the requested attribute.
AttributeNotFound,
/// The HTML rewriter used by `Element::extract_attr` reported an error.
Rewriting(RewritingError),
/// A field parser failed; the derive macro boxes the parser's error into this variant.
Parse(Box<dyn Error>),
}
impl Display for InstantiationError {
    /// Writes the same text as the `Debug` representation.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_fmt(format_args!("{:?}", self))
    }
}
// Uses the trait's default methods only; `source()` is not overridden.
impl Error for InstantiationError {}
impl From<RewritingError> for InstantiationError {
fn from(value: RewritingError) -> Self {
Self::Rewriting(value)
}
}
/// A type that can be built from elements queried on a page.
///
/// Usually implemented through `#[derive(FromElement)]`.
pub trait FromElement: Sized {
/// Locator of the root element; each match yields one value.
fn locator() -> Locator;
/// Locators of the child elements, resolved relative to each root match.
fn children() -> Vec<Locator>;
/// Builds the value from one query result (root element first, then the children).
fn create(elements: QueryResult) -> Result<Self, InstantiationError>;
}

View File

@ -0,0 +1,81 @@
// WebSocket connection back to the driver. `{{port}}` is a template placeholder,
// presumably substituted before this script is injected — confirm on the Rust side.
const socket = new WebSocket("ws://127.0.0.1:{{port}}");
// Page-global bookkeeping: `nonce` is the last id handed out by getNonce(),
// `elementRefs` collects every { id, el } record created by selectElements().
const environment = {
nonce: 1,
elementRefs: []
}
// Advances the shared counter by one and returns the new value.
function getNonce() {
    environment.nonce += 1;
    return environment.nonce;
}
// Selects all elements matching `selector` below `root` (the whole document when
// `root` is null or undefined), gives each a fresh id, records it in
// `environment.elementRefs`, and returns the new { id, el } records in match order.
function selectElements(selector, root = null) {
    const scope = root == null ? document : root;
    const result = [];
    // `const` keeps the loop variable local; the bare `for (e of ...)` leaked a global.
    for (const e of scope.querySelectorAll(selector)) {
        const f = {
            id: getNonce(),
            el: e
        };
        result.push(f);
        environment.elementRefs.push(f);
    }
    return result;
}
// Returns the recorded { id, el } entry with the given id, or undefined if none exists.
function getElement(id) {
    for (const ref of environment.elementRefs) {
        if (ref.id === id) {
            return ref;
        }
    }
    return undefined;
}
// Resolves a structured query. `selectors[0]` selects the root elements; the
// remaining selectors are resolved relative to each root by findChildren().
// Returns one array per root (root first, then one entry per child selector).
// Roots for which findChildren() throws are logged to the console and left out.
function findElements(selectors) {
    const roots = selectElements(selectors[0]);
    const childSelectors = selectors.slice(1);
    const results = [];
    // `const` keeps the loop variable local instead of creating a global `root`.
    for (const root of roots) {
        try {
            results.push(findChildren(root, childSelectors));
        } catch (e) {
            console.log(e)
        }
    }
    return results;
}
// For one root record, picks the first match of every child selector inside the
// root element and returns serialisable snapshots of the root followed by the
// children. Throws if a child selector matches nothing, so the caller can skip
// this root.
function findChildren(root, childSelectors) {
    const allElems = [root];
    for (const s of childSelectors) {
        const match = selectElements(s, root.el)[0];
        if (match === undefined) {
            // Fail with a clear message instead of a TypeError from the map() below.
            throw new Error(`no element matches child selector "${s}"`);
        }
        allElems.push(match);
    }
    return allElems.map((e) => ({
        id: e.id,
        outer_html: e.el.outerHTML,
        inner_html: e.el.innerHTML,
        inner_text: e.el.innerText
    }));
}
// Handles one request from the driver: parses the JSON message, evaluates
// `msg.script`, and sends back a JSON reply with the request id, the result (or
// the exception) and the current document HTML.
socket.addEventListener("message", (event) => {
console.log("Message from server ", event.data);
const msg = JSON.parse(event.data);
let reply = {};
try {
reply.id = msg.id;
// Indirect eval: the script is evaluated in the global scope, not in this handler's scope.
reply.result = (1, eval)(msg.script);
} catch (e) {
// NOTE(review): an Error has no enumerable own properties, so JSON.stringify
// likely serialises this as {} — confirm what the driver expects here.
reply.exception = e;
}
reply.html = document.documentElement.outerHTML;
reply = JSON.stringify(reply);
console.log(`Replied to Server Message (${reply.length})`)
console.log(reply);
socket.send(reply);
});
// Only logs the disconnect; nothing in this handler attempts to reconnect.
socket.addEventListener("close", () => {
console.log("Server Disconnected.")
});

View File

@ -1,14 +1,15 @@
use driver::Browser;
pub(crate) mod network;
pub(crate) mod chromium;
pub mod driver;
pub mod error;
pub mod msg;
pub mod locator;
pub mod cooldown;
pub mod driver;
pub mod element;
pub mod error;
pub mod locator;
pub mod msg;
pub(crate) mod network;
#[cfg(test)]
//#[cfg(test)]
mod tests;
trait Ignore {
@ -59,3 +60,5 @@ impl Default for Polonium {
Self::new("target/polonium-tmp-data")
}
}
pub use polonium_macros::FromElement;

19
polonium/src/locator.rs Normal file
View File

@ -0,0 +1,19 @@
/// Describes how to find elements on a page.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Locator {
    /// A CSS selector.
    Css(String),
}

/// Escapes `raw` so it can be placed between single quotes in JavaScript source.
fn escape_js_single_quoted(raw: &str) -> String {
    let mut escaped = String::with_capacity(raw.len());
    for ch in raw.chars() {
        match ch {
            '\\' => escaped.push_str("\\\\"),
            '\'' => escaped.push_str("\\'"),
            '\n' => escaped.push_str("\\n"),
            '\r' => escaped.push_str("\\r"),
            // Line terminators that older JavaScript engines reject inside string literals.
            '\u{2028}' => escaped.push_str("\\u2028"),
            '\u{2029}' => escaped.push_str("\\u2029"),
            other => escaped.push(other),
        }
    }
    escaped
}

impl Locator {
    /// Builds the JavaScript expression that selects the located elements.
    ///
    /// The selector is escaped before being embedded in the single-quoted string
    /// literal, so selectors containing quotes or backslashes (for example
    /// `a[title='x']`) still produce valid JavaScript.
    pub(crate) fn js_expr(&self) -> String {
        match self {
            Self::Css(css) => {
                let css = escape_js_single_quoted(css);
                format!("document.querySelectorAll('{css}')")
            }
        }
    }
}
/// Constructors for [`Locator`] values, e.g. `by::css("h1")`.
pub mod by {
    use super::Locator;

    /// Creates a locator from a CSS selector; the argument is converted with `to_string`.
    pub fn css(css: impl ToString) -> Locator {
        let selector = css.to_string();
        Locator::Css(selector)
    }
}

View File

@ -180,7 +180,6 @@ async fn stream(tx: Sender<IMsg>, id: ConId, stream: TcpStream) {
loop {
select! {
incoming = stream.next() => {
println!("incoming msg: {incoming:?}");
if let Some(msg) = incoming {
let msg = msg?;
match msg {
@ -204,7 +203,6 @@ async fn stream(tx: Sender<IMsg>, id: ConId, stream: TcpStream) {
}
},
outgoing = rx.recv() => {
println!("outgoing msg: {outgoing:?}");
if let Some(OMsg::Message(msg)) = outgoing {
let json = serde_json::to_string(&msg)?;
stream.send(Message::Text(json)).await?;

72
polonium/src/tests.rs Normal file
View File

@ -0,0 +1,72 @@
use std::time::Duration;
use crate::element::Element;
use crate::{locator::by, *};
// Opens a Wikipedia article and checks that the page title mentions it.
// Needs a browser and network access.
#[tokio::test]
async fn basic_test() -> crate::error::Result<()> {
let mut pol = Polonium::default();
let mut browser = pol.browser().await?;
browser
.goto("https://en.wikipedia.org/wiki/Germany")
.await?;
// Fixed wait before reading the title — presumably to let the page finish loading.
tokio::time::sleep(Duration::from_secs(2)).await;
let title = browser.title().await?;
// Close before asserting so the browser is shut down even when the assertion fails.
browser.close().await;
assert!(title.contains("Germany"));
Ok(())
}
// Clicks the link to the German Wikipedia on the portal page and checks the title
// of the page that opens. Needs a browser and network access.
#[tokio::test]
async fn wiki_click() -> crate::error::Result<()> {
let mut pol = Polonium::default();
// Presumably moves to another port so this test does not clash with the others — confirm.
pol.next_port();
let mut browser = pol.browser().await?;
browser.goto("https://wikipedia.org/").await?;
// go to German wikipedia
browser.click_link(by::css("a#js-link-box-de")).await?;
tokio::time::sleep(Duration::from_secs(5)).await;
let title = browser.title().await?;
// Close before asserting so the browser is shut down even when the assertion fails.
browser.close().await;
assert!(title.contains("Die freie Enzyklopädie"));
Ok(())
}
// The derive macro emits paths starting with `polonium::`, so inside the crate itself
// the crate root has to be reachable under that name.
use crate as polonium;
// Test fixture: one value per element matching `body div`, with the text of its
// `h1` and `p` descendants.
#[derive(FromElement, Debug)]
#[css("body div")]
struct Example {
// Text of the `h1` inside the root element.
#[css("h1")]
title: String,
// Text of the `p` inside the root element.
#[css("p")]
text: String,
}
// End-to-end test of `find_all` with the derived `Example` type against example.com.
// Needs a browser and network access.
#[tokio::test]
async fn structured_parsing() -> crate::error::Result<()> {
let mut pol = Polonium::default();
// Presumably advances the port twice so this test does not clash with the two above — confirm.
pol.next_port();
pol.next_port();
let mut browser = pol.browser().await?;
browser.goto("https://example.com").await?;
tokio::time::sleep(Duration::from_secs(5)).await;
let example_domain = browser.find_all::<Example>().await?;
println!("{example_domain:?}");
tokio::time::sleep(Duration::from_secs(5)).await;
// Close before asserting so the browser is shut down even when an assertion fails.
browser.close().await;
// Indexing panics if nothing matched, which fails the test as well.
let example_domain = &example_domain[0];
assert_eq!(example_domain.title, "Example Domain");
assert_eq!(example_domain.text, "This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission.");
Ok(())
}

View File

@ -1,21 +0,0 @@
const socket = new WebSocket("ws://127.0.0.1:{{port}}");
socket.addEventListener("message", (event) => {
console.log("Message from server ", event.data);
const msg = JSON.parse(event.data);
let reply = {};
try {
reply.id = msg.id;
reply.result = eval(msg.script);
} catch (e) {
reply.exception = e;
}
reply.html = document.documentElement.outerHTML;
reply = JSON.stringify(reply);
socket.send(reply);
});
socket.addEventListener("close", () => {
console.log("Server Disconnected.")
});

View File

@ -1,14 +0,0 @@
pub enum By {
Css(String),
}
impl By {
pub fn css(css: impl ToString) -> Self {
Self::Css(css.to_string())
}
pub(crate) fn js_expr(&self) -> String {
match self {
By::Css(css) => format!("document.querySelectorAll('{css}')"),
}
}
}

View File

@ -1,40 +0,0 @@
use std::time::Duration;
use crate::{*, locator::By};
#[tokio::test]
async fn basic_test() -> crate::error::Result<()> {
let mut pol = Polonium::default();
let mut browser = pol.browser().await?;
browser
.goto("https://en.wikipedia.org/wiki/Germany")
.await?;
tokio::time::sleep(Duration::from_secs(2)).await;
let title = browser.title().await?;
browser.close().await;
assert!(title.contains("Germany"));
Ok(())
}
#[tokio::test]
async fn wiki_click() -> crate::error::Result<()> {
let mut pol = Polonium::default();
pol.next_port();
let mut browser = pol.browser().await?;
browser
.goto("https://wikipedia.org/")
.await?;
// go to German wikipedia
browser.click_link(By::css("a#js-link-box-de")).await?;
tokio::time::sleep(Duration::from_secs(5)).await;
let title = browser.title().await?;
browser.close().await;
assert!(title.contains("Die freie Enzyklopädie"));
Ok(())
}