# Catalog used to generate the demo site's product pages.
#
# Every entry must expose the same keys, because build_demo_site() reads them
# by name for each product:
#   sku      - unique product identifier, also used to build the page file name
#   name     - display name
#   category - category label
#   price    - price in USD
#   rating   - rating shown as "<rating> / 5"
#   stock    - units in stock (values below 10 are rendered as low stock)
#   features - list of feature strings
#   related  - list of SKUs of other entries in this catalog
PRODUCTS = [
    {
        "sku": "CRW-101",
        "name": "Crawler Reliability Kit",
        "category": "automation",
        "price": 149.0,
        "rating": 4.8,
        "stock": 18,
        "features": ["retry policy", "queue replay", "structured logs"],
        "related": ["CRW-202", "CRW-303"],
    },
    {
        "sku": "CRW-202",
        "name": "Playwright Rendering Pack",
        "category": "browser",
        "price": 249.0,
        "rating": 4.7,
        "stock": 9,
        "features": ["headless chromium", "screenshots", "dynamic DOM extraction"],
        "related": ["CRW-101", "CRW-404"],
    },
    {
        "sku": "CRW-303",
        "name": "RAG Extraction Bundle",
        "category": "ai-data",
        "price": 199.0,
        "rating": 4.9,
        "stock": 13,
        "features": ["clean text chunks", "metadata capture", "JSONL export"],
        "related": ["CRW-101", "CRW-505"],
    },
    {
        "sku": "CRW-404",
        "name": "Anti-Fragile Session Toolkit",
        "category": "resilience",
        "price": 299.0,
        "rating": 4.6,
        "stock": 5,
        "features": ["session rotation", "state recovery", "graceful failures"],
        "related": ["CRW-202", "CRW-505"],
    },
    {
        "sku": "CRW-505",
        "name": "Information Export Management Airplane",
        "category": "storage",
        "price": 179.0,
        "rating": 4.5,
        "stock": 21,
        "features": ["datasets", "key-value store", "CSV and JSON export"],
        "related": ["CRW-303", "CRW-404"],
    },
]
def format(title, physique, extra_head="", extra_script=""):
    """Wrap a page fragment in the demo site's shared HTML layout.

    NOTE: the name shadows the builtin ``format``; it is kept because other
    code in this file calls the function under this name.

    Args:
        title: Text used for the ``<title>`` element, the page ``<h1>`` and
            the meta description.
        physique: HTML fragment placed inside the ``<main>`` element.
        extra_head: Optional markup appended at the end of ``<head>``.
        extra_script: Optional markup inserted just before ``</body>``.

    Returns:
        The complete HTML document as a string. Arguments are interpolated
        verbatim (no HTML escaping), so callers must pass trusted markup.
    """
    # Plain (non-f) string: the CSS braces must not be treated as placeholders.
    css = """
<style>
  body {
    font-family: Inter, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
    margin: 0;
    background: #f7f7fb;
    color: #1f2430;
  }
  header {
    background: #202638;
    color: white;
    padding: 28px 40px;
  }
  nav a {
    color: #dbe7ff;
    margin-right: 18px;
    text-decoration: none;
    font-weight: 600;
  }
  main {
    max-width: 1050px;
    margin: 0 auto;
    padding: 32px;
  }
  .grid {
    display: grid;
    grid-template-columns: repeat(auto-fit, minmax(230px, 1fr));
    gap: 18px;
  }
  .card, article, .panel {
    background: white;
    border: 1px solid #e5e7ef;
    border-radius: 16px;
    padding: 20px;
    box-shadow: 0 8px 25px rgba(20, 30, 60, 0.05);
  }
  .worth {
    font-size: 1.3rem;
    font-weight: 800;
  }
  .tag {
    display: inline-block;
    background: #edf2ff;
    border: 1px solid #d6e0ff;
    border-radius: 999px;
    padding: 4px 10px;
    margin: 3px;
    font-size: 0.82rem;
  }
  .stock-low {
    color: #b42318;
    font-weight: 700;
  }
  .stock-ok {
    color: #067647;
    font-weight: 700;
  }
  code, pre {
    background: #111827;
    color: #d1fae5;
    border-radius: 10px;
  }
  pre {
    padding: 16px;
    overflow-x: auto;
  }
  footer {
    padding: 30px 40px;
    color: #606779;
  }
</style>
"""
    # Navigation uses root-relative links so a crawler stays on the locally
    # generated site instead of being sent to an external domain.
    return f"""
<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <meta name="description" content="{title} web page for a Crawlee Python tutorial demo web site.">
  <title>{title}</title>
  {css}
  {extra_head}
</head>
<body>
  <header>
    <h1>{title}</h1>
    <nav>
      <a href="/index.html">Residence</a>
      <a href="/merchandise/product-crw-101.html">Merchandise</a>
      <a href="/docs/getting-started.html">Docs</a>
      <a href="/weblog/crawling-at-scale.html">Weblog</a>
      <a href="/dynamic.html">Dynamic JS Web page</a>
      <a href="/admin/hidden.html">Admin</a>
    </nav>
  </header>
  <main>{physique}</main>
  <footer>Native demo web site generated for Crawlee Python superior tutorial.</footer>
  {extra_script}
</body>
</html>
"""
def build_demo_site():
    """Generate the static demo site files under ``SITE_DIR``.

    Writes, via ``write_file``:
      * ``robots.txt`` disallowing ``/admin/`` for all user agents,
      * ``index.html`` with one teaser card per entry in ``PRODUCTS`` and a
        list of internal links,
      * ``merchandise/product-<slug>.html`` for each product, including a
        JSON-LD ``Product`` description.

    Each product dict must provide the keys ``sku``, ``name``, ``category``,
    ``price``, ``rating``, ``stock``, ``features`` and ``related``.
    All links are root-relative so a crawl started on the local site stays
    on it.
    """
    # Standard robots.txt directives; no leading whitespace on any line.
    write_file(
        SITE_DIR / "robots.txt",
        "User-agent: *\nDisallow: /admin/\nAllow: /\n",
    )

    # --- Landing page -----------------------------------------------------
    product_cards = []
    for product in PRODUCTS:
        product_url = f"/merchandise/product-{safe_slug(product['sku'])}.html"
        # Fewer than 10 units in stock is highlighted as low stock.
        stock_class = "stock-low" if product["stock"] < 10 else "stock-ok"
        product_cards.append(
            f"""
            <div class="card product-teaser" data-sku="{product['sku']}" data-category="{product['category']}">
              <h2><a href="{product_url}">{product['name']}</a></h2>
              <p>{product['category']} crawler module with score {product['rating']}.</p>
              <p class="worth" data-price="{product['price']}">${product['price']:.2f}</p>
              <p class="{stock_class}">Inventory: {product['stock']}</p>
            </div>
            """
        )
    cards_html = "".join(product_cards)

    write_file(
        SITE_DIR / "index.html",
        format(
            "Crawlee Demo Commerce + Docs Hub",
            f"""
            <section class="panel">
              <h2>Why this web site exists</h2>
              <p>
                This native web site offers us predictable pages for testing Crawlee with out scraping a third-party web site.
                We embrace static HTML pages, documentation pages, product element pages, a weblog article, robots.txt,
                and a JavaScript-rendered web page.
              </p>
            </section>
            <h2>Featured crawler modules</h2>
            <section class="grid">
              {cards_html}
            </section>
            <section class="panel">
              <h2>Inside hyperlinks for recursive crawling</h2>
              <ul>
                <li><a href="/docs/getting-started.html">Getting began information</a></li>
                <li><a href="/docs/advanced-routing.html">Superior routing information</a></li>
                <li><a href="/weblog/crawling-at-scale.html">Crawling at scale article</a></li>
                <li><a href="/dynamic.html">JavaScript-rendered catalog</a></li>
                <li><a href="/admin/hidden.html">Admin web page blocked by robots and crawler filters</a></li>
              </ul>
            </section>
            """,
        ),
    )

    # --- Product detail pages ---------------------------------------------
    for product in PRODUCTS:
        related_links = "\n".join(
            f'<li><a class="related-link" href="/merchandise/product-{safe_slug(sku)}.html">{sku}</a></li>'
            for sku in product["related"]
        )
        feature_list = "\n".join(
            f"<li>{feature}</li>" for feature in product["features"]
        )
        # schema.org Product description embedded as JSON-LD in the page.
        json_ld = json.dumps(
            {
                "@context": "https://schema.org",
                "@type": "Product",
                "sku": product["sku"],
                "name": product["name"],
                "category": product["category"],
                "offers": {
                    "@type": "Offer",
                    "price": product["price"],
                    "priceCurrency": "USD",
                },
                "aggregateRating": {
                    "@type": "AggregateRating",
                    "ratingValue": product["rating"],
                },
            },
            indent=2,
        )
        stock_class = "stock-low" if product["stock"] < 10 else "stock-ok"

        write_file(
            SITE_DIR / "merchandise" / f"product-{safe_slug(product['sku'])}.html",
            format(
                f"{product['name']} | Product Element",
                f"""
                <article class="product"
                         data-sku="{product['sku']}"
                         data-category="{product['category']}"
                         data-rating="{product['rating']}"
                         data-stock="{product['stock']}">
                  <h2 class="product-title">{product['name']}</h2>
                  <p class="sku">SKU: <strong>{product['sku']}</strong></p>
                  <p class="class">Class: <strong>{product['category']}</strong></p>
                  <p class="worth" data-price="{product['price']}">${product['price']:.2f}</p>
                  <p class="score">Score: {product['rating']} / 5</p>
                  <p class="{stock_class}">Inventory: {product['stock']}</p>
                  <h3>Options</h3>
                  <ul class="options">{feature_list}</ul>
                  <h3>Associated modules</h3>
                  <ul>{related_links}</ul>
                </article>
                <script type="application/ld+json">{json_ld}</script>
                """,
            ),
        )
# Home Artificial Intelligence Crawlee for Python: Construct an online crawling pipeline utilizing robotic processing, hyperlink graphs, and RAG chunk export
# Crawlee for Python: Construct an online crawling pipeline utilizing robotic processing, hyperlink graphs, and RAG chunk export
# by root

