Krawler
Krawler - Kotlin Multiplatform Web Crawler Library
Install / Use
/learn @DreamForgeSolutions/KrawlerREADME
Krawler 🕷️
A powerful, modern web crawling and scraping library for Kotlin Multiplatform. Build efficient web crawlers that run on JVM, Android, iOS, JavaScript, and WebAssembly with a beautiful Kotlin DSL.
✨ Features
- 🌍 True Multiplatform: Single codebase runs on JVM, Android, iOS, JS, and WASM
- 🎯 Intuitive Kotlin DSL: Configure crawlers with clean, type-safe syntax
- 🚀 High Performance: Concurrent crawling with coroutines and smart rate limiting
- 🔍 Flexible Extraction: CSS selectors, XPath, regex, and custom extractors
- 🤖 Robots.txt Compliance: Respects website crawling policies automatically
- 📊 Built-in Analytics: Track performance metrics and crawl statistics
- 🔌 Extensible Architecture: Clean architecture with pluggable components
- 💾 Smart Caching: Reduce redundant requests with intelligent caching
- 🎨 Sample App: Full-featured Compose Multiplatform demo application
📋 Table of Contents
- Installation
- Quick Start
- Platform Setup
- Core Concepts
- Advanced Usage
- Architecture
- Sample Application
- API Reference
- Contributing
- License
📦 Installation
Multiplatform Project
Add Krawler to your build.gradle.kts:
kotlin {
commonMain {
dependencies {
implementation("solutions.dreamforge.krawler:krawler:0.0.1")
}
}
}
Platform-Specific Projects
<details>
<summary>JVM/Android</summary>

dependencies {
implementation("solutions.dreamforge.krawler:krawler-jvm:0.0.1")
}
</details>
<details>
<summary>iOS</summary>
kotlin {
ios {
binaries {
framework {
baseName = "krawler"
}
}
}
}
</details>
<details>
<summary>JavaScript</summary>
dependencies {
implementation("solutions.dreamforge.krawler:krawler-js:0.0.1")
}
</details>
🚀 Quick Start
Basic Example
import solutions.dreamforge.krawler.*
import solutions.dreamforge.krawler.dsl.*
suspend fun main() {
// Create a crawler instance
val crawler = CrawlerSDK.create()
// Define your crawl configuration
val config = crawler {
name = "My First Crawler"
maxConcurrency = 10
source("example") {
urls("https://example.com")
depth(2)
extract {
text("title", "h1")
text("description", "meta[name=description]")
links("links", "a[href]") {
multiple()
}
}
}
}
// Start crawling and collect results
crawler.crawl(config).collect { result ->
when (result.status) {
CrawlStatus.SUCCESS -> {
println("Crawled: ${result.webPage?.url}")
println("Title: ${result.webPage?.extractedData["title"]}")
}
else -> println("Failed: ${result.error}")
}
}
}
Advanced Configuration
val advancedConfig = crawler {
name = "Advanced News Crawler"
maxConcurrency = 20
// Global extraction rules
extract {
text("title", "h1, h2, .headline") {
required()
process {
trim()
uppercase()
}
}
html("content", "article, .post-content") {
process {
// Remove ads and scripts
custom("clean-html")
}
}
// Extract structured data
regex("price", "\\$([0-9,]+\\.?[0-9]*)", group = 1)
}
// Global crawl policy
policy {
respectRobotsTxt = true
delay(2000) // 2 seconds between requests
userAgent = "MyNewsBot/1.0"
maxRetries = 3
timeout = 15000
allowContentTypes("text/html", "application/xhtml+xml")
headers {
put("Accept-Language", "en-US,en;q=0.9")
put("Accept-Encoding", "gzip, deflate")
}
}
// Multiple sources with different configurations
source("tech-news") {
urls(
"https://techcrunch.com",
"https://theverge.com",
"https://arstechnica.com"
)
depth(3)
priority(CrawlRequest.Priority.HIGH)
// Source-specific rules
extract {
text("author", ".author-name, .by-line")
text("date", "time[datetime]")
}
}
source("business-news") {
urls("https://bloomberg.com", "https://ft.com")
depth(2)
priority(CrawlRequest.Priority.NORMAL)
policy {
delay(5000) // More conservative for premium sites
}
}
}
🔧 Platform Setup
JVM Configuration
val crawler = CrawlerSDK.create(
SDKConfiguration(
userAgent = "MyBot/1.0 (Compatible; JVM)",
maxConcurrency = 50,
connectTimeoutSeconds = 10,
readTimeoutSeconds = 30
)
)
Android Permissions
Add to your AndroidManifest.xml:
<uses-permission android:name="android.permission.INTERNET" />
<uses-permission android:name="android.permission.ACCESS_NETWORK_STATE" />
iOS Configuration
No special configuration required. The library uses native iOS networking APIs.
JavaScript/Browser
// Runs in browser with CORS limitations
val crawler = CrawlerSDK.create(
SDKConfiguration(
userAgent = "MyBot/1.0 (Compatible; Browser)",
maxConcurrency = 10 // Limited by browser
)
)
📚 Core Concepts
Crawl Request
The fundamental unit of crawling:
val request = CrawlRequest(
id = "unique-id",
url = "https://example.com",
depth = 0,
maxDepth = 3,
extractionRules = listOf(/* ... */),
crawlPolicy = CrawlPolicy(/* ... */),
priority = CrawlRequest.Priority.HIGH,
metadata = mapOf("category" to "tech"),
timestamp = Clock.System.now()
)
Extraction Rules
Define what data to extract:
// CSS Selector
val titleRule = ExtractionRule(
name = "title",
selector = Selector.CssSelector("h1.main-title"),
extractionType = ExtractionType.TEXT,
required = true
)
// XPath
val priceRule = ExtractionRule(
name = "price",
selector = Selector.XPathSelector("//span[@class='price']/text()"),
extractionType = ExtractionType.TEXT,
postProcessors = listOf(
PostProcessor.Extract("([0-9.]+)", 1),
PostProcessor.Custom("parse-currency")
)
)
// Regex
val emailRule = ExtractionRule(
name = "emails",
selector = Selector.RegexSelector("[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}"),
extractionType = ExtractionType.TEXT,
multiple = true
)
Post Processors
Transform extracted data:
extract {
text("price", ".price") {
process {
trim()
replace("$", "")
replace(",", "")
custom("to-number")
}
}
text("description", ".desc") {
process {
trim()
substring(0, 200)
custom("remove-html") {
// Configuration for custom processor
put("preserve-links", "true")
}
}
}
}
Crawl Policies
Control crawler behavior:
policy {
respectRobotsTxt = true
followRedirects = true
maxRedirects = 5
delayBetweenRequests = 1000 // milliseconds
maxRetries = 3
timeout = 30000
maxContentLength = 10 * 1024 * 1024 // 10MB
allowContentTypes(
"text/html",
"application/xhtml+xml",
"application/xml"
)
headers {
put("Accept", "text/html,application/xhtml+xml")
put("Accept-Language", "en-US,en;q=0.9")
put("Cache-Control", "no-cache")
}
}
🔥 Advanced Usage
Batch Crawling
val requests = (1..100).map { page ->
CrawlRequest(
id = "page-$page",
url = "https://example.com/products?page=$page",
// ... other configuration
)
}
crawler.batchCrawl(
requests = requests,
maxConcurrency = 20,
batchId = "products-crawl"
).collect { result ->
// Process results
}
Custom Extraction Engine
class MyCustomExtractor : ExtractionEngine {
override suspend fun extract(
html: String,
rules: List<ExtractionRule>
): Map<String, ExtractedValue> {
// Custom extraction logic
return extractedData
}
}
val crawler = CrawlerSDK.create(
extractionEngine = MyCustomExtractor(),
// ... other components
)
Progress Monitoring
val crawler = CrawlerSDK.create()
// Monitor statistics
launch {
while (true) {
val stats = crawler.getStats()
println("""
Active: ${stats.activeCrawls}
Completed: ${stats.completedCrawls}
Failed: ${stats.failedCrawls}
Queue Size: ${stats.queueSize}
Avg Response Time: ${stats.averageResponseTime}ms
""".trimIndent())
delay(1000)
}
}
// Start crawling
crawler.crawl(config).collect { /* ... */ }
Error Handling
crawler.crawl(config).collect { result
