Rust
[Rust] Web Crawler 예제 (55일차)
꾸압
2023. 2. 8. 23:59
<예제 코드_1>
==> HTTP Request 생성
# Cargo.toml
# ...중략
[dependencies]
reqwest = { version = "0.11", features = ["json", "blocking"] } # Request with JSON parsing support
futures = "0.3" # for our async / await blocks
tokio = { version = "1.12.0", features = ["full"] } # for our async runtime
// src/main.rs
use std::io::Read;
/// Fetches the front page with a blocking reqwest client, prints the
/// HTTP status, then prints a short preview of the response body.
///
/// Panics (via `unwrap`) if the request or body read fails — acceptable
/// for a tutorial example, not for production code.
fn main() {
    let client = reqwest::blocking::Client::new();
    let origin_url = "https://rolisz.ro/";
    let mut res = client.get(origin_url).send().unwrap();
    println!("Status for {}: {}", origin_url, res.status());

    let mut body = String::new();
    res.read_to_string(&mut body).unwrap();

    // BUG FIX: `&body[0..40]` panics when the body is shorter than 40
    // bytes, or when byte index 40 falls inside a multi-byte UTF-8
    // character (string slicing is by byte and must land on a char
    // boundary). Taking up to 40 chars is safe for any response.
    let preview: String = body.chars().take(40).collect();
    println!("HTML: {}", preview);
}
<예제 코드_2>
==> Link 추출
# Cargo.toml
# --- 중략
[dependencies]
reqwest = { version = "0.11", features = ["json", "blocking"] }
futures = "0.3"
tokio = { version = "1.12.0", features = ["full"] }
select = { git = "https://github.com/utkarshkukreti/select.rs" }
// src/main.rs
use std::io::Read;
use select::document::Document;
use select::predicate::Name;
/// Downloads the front page and prints every `href` attribute found on
/// an `<a>` element, one URL per line.
fn main() {
    let origin_url = "https://rolisz.ro/";
    let http = reqwest::blocking::Client::new();

    let mut response = http.get(origin_url).send().unwrap();
    println!("Status for {}: {}", origin_url, response.status());

    let mut html = String::new();
    response.read_to_string(&mut html).unwrap();

    // Walk every anchor tag; anchors without an href are skipped.
    let document = Document::from(html.as_str());
    for anchor in document.find(Name("a")) {
        if let Some(href) = anchor.attr("href") {
            println!("{}", href);
        }
    }
}
<예제 코드_3>
==> URL을 Hash로 가져오기
# Cargo.toml
# --- 중략
[dependencies]
reqwest = { version = "0.11", features = ["json", "blocking"] }
futures = "0.3"
tokio = { version = "1.12.0", features = ["full"] }
select = { git = "https://github.com/utkarshkukreti/select.rs" }
use std::io::Read;
use select::document::Document;
use select::predicate::Name;
use std::collections::HashSet;
/// Downloads the front page and collects every distinct `href` value
/// from its `<a>` elements into a `HashSet`, then prints the set.
fn main() {
    let page_url = "https://rolisz.ro/";
    let web_client = reqwest::blocking::Client::new();

    let mut reply = web_client.get(page_url).send().unwrap();
    println!("Status for {}: {}", page_url, reply.status());

    let mut page = String::new();
    reply.read_to_string(&mut page).unwrap();

    // Deduplicate links by inserting each href into a set; anchors
    // without an href attribute are ignored.
    let mut found_urls: HashSet<String> = HashSet::new();
    for node in Document::from(page.as_str()).find(Name("a")) {
        if let Some(href) = node.attr("href") {
            found_urls.insert(href.to_string());
        }
    }

    println!("URLs: {:?}", found_urls);
}
<참조 1> https://rolisz.ro/2020/03/01/web-crawler-in-rust/
<참조 2>