Krawler
Krawler - Kotlin Multiplatform Web Crawler Library
Install / Use
/learn @DreamForgeSolutions/KrawlerREADME
Krawler 🕷️
A powerful, modern web crawling and scraping library for Kotlin Multiplatform. Build efficient web crawlers that run on JVM, Android, iOS, JavaScript, and WebAssembly with a beautiful Kotlin DSL.
✨ Features
- 🌍 True Multiplatform: Single codebase runs on JVM, Android, iOS, JS, and WASM
- 🎯 Intuitive Kotlin DSL: Configure crawlers with clean, type-safe syntax
- 🚀 High Performance: Concurrent crawling with coroutines and smart rate limiting
- 🔍 Flexible Extraction: CSS selectors, XPath, regex, and custom extractors
- 🤖 Robots.txt Compliance: Respects website crawling policies automatically
- 📊 Built-in Analytics: Track performance metrics and crawl statistics
- 🔌 Extensible Architecture: Clean architecture with pluggable components
- 💾 Smart Caching: Reduce redundant requests with intelligent caching
- 🎨 Sample App: Full-featured Compose Multiplatform demo application
📋 Table of Contents
- Installation
- Quick Start
- Platform Setup
- Core Concepts
- Advanced Usage
- Architecture
- Sample Application
- API Reference
- Contributing
- License
📦 Installation
Multiplatform Project
Add Krawler to your build.gradle.kts:
kotlin {
commonMain {
dependencies {
implementation("solutions.dreamforge.krawler:krawler:0.0.1")
}
}
}
Platform-Specific Projects
<details>
<summary>JVM/Android</summary>

dependencies {
implementation("solutions.dreamforge.krawler:krawler-jvm:0.0.1")
}
</details>
<details>
<summary>iOS</summary>
kotlin {
ios {
binaries {
framework {
baseName = "krawler"
}
}
}
}
</details>
<details>
<summary>JavaScript</summary>
dependencies {
implementation("solutions.dreamforge.krawler:krawler-js:0.0.1")
}
</details>
🚀 Quick Start
Basic Example
import solutions.dreamforge.krawler.*
import solutions.dreamforge.krawler.dsl.*
suspend fun main() {
// Create a crawler instance
val crawler = CrawlerSDK.create()
// Define your crawl configuration
val config = crawler {
name = "My First Crawler"
maxConcurrency = 10
source("example") {
urls("https://example.com")
depth(2)
extract {
text("title", "h1")
text("description", "meta[name=description]")
links("links", "a[href]") {
multiple()
}
}
}
}
// Start crawling and collect results
crawler.crawl(config).collect { result ->
when (result.status) {
CrawlStatus.SUCCESS -> {
println("Crawled: ${result.webPage?.url}")
println("Title: ${result.webPage?.extractedData["title"]}")
}
else -> println("Failed: ${result.error}")
}
}
}
Advanced Configuration
val advancedConfig = crawler {
name = "Advanced News Crawler"
maxConcurrency = 20
// Global extraction rules
extract {
text("title", "h1, h2, .headline") {
required()
process {
trim()
uppercase()
}
}
html("content", "article, .post-content") {
process {
// Remove ads and scripts
custom("clean-html")
}
}
// Extract structured data
regex("price", "\\$([0-9,]+\\.?[0-9]*)", group = 1)
}
// Global crawl policy
policy {
respectRobotsTxt = true
delay(2000) // 2 seconds between requests
userAgent = "MyNewsBot/1.0"
maxRetries = 3
timeout = 15000
allowContentTypes("text/html", "application/xhtml+xml")
headers {
put("Accept-Language", "en-US,en;q=0.9")
put("Accept-Encoding", "gzip, deflate")
}
}
// Multiple sources with different configurations
source("tech-news") {
urls(
"https://techcrunch.com",
"https://theverge.com",
"https://arstechnica.com"
)
depth(3)
priority(CrawlRequest.Priority.HIGH)
// Source-specific rules
extract {
text("author", ".author-name, .by-line")
text("date", "time[datetime]")
}
}
source("business-news") {
urls("https://bloomberg.com", "https://ft.com")
depth(2)
priority(CrawlRequest.Priority.NORMAL)
policy {
delay(5000) // More conservative for premium sites
}
}
}
🔧 Platform Setup
JVM Configuration
val crawler = CrawlerSDK.create(
SDKConfiguration(
userAgent = "MyBot/1.0 (Compatible; JVM)",
maxConcurrency = 50,
connectTimeoutSeconds = 10,
readTimeoutSeconds = 30
)
)
Android Permissions
Add to your AndroidManifest.xml:
<uses-permission android:name="android.permission.INTERNET" />
<uses-permission android:name="android.permission.ACCESS_NETWORK_STATE" />
iOS Configuration
No special configuration required. The library uses native iOS networking APIs.
JavaScript/Browser
// Runs in browser with CORS limitations
val crawler = CrawlerSDK.create(
SDKConfiguration(
userAgent = "MyBot/1.0 (Compatible; Browser)",
maxConcurrency = 10 // Limited by browser
)
)
📚 Core Concepts
Crawl Request
The fundamental unit of crawling:
val request = CrawlRequest(
id = "unique-id",
url = "https://example.com",
depth = 0,
maxDepth = 3,
extractionRules = listOf(/* ... */),
crawlPolicy = CrawlPolicy(/* ... */),
priority = CrawlRequest.Priority.HIGH,
metadata = mapOf("category" to "tech"),
timestamp = Clock.System.now()
)
Extraction Rules
Define what data to extract:
// CSS Selector
val titleRule = ExtractionRule(
name = "title",
selector = Selector.CssSelector("h1.main-title"),
extractionType = ExtractionType.TEXT,
required = true
)
// XPath
val priceRule = ExtractionRule(
name = "price",
selector = Selector.XPathSelector("//span[@class='price']/text()"),
extractionType = ExtractionType.TEXT,
postProcessors = listOf(
PostProcessor.Extract("([0-9.]+)", 1),
PostProcessor.Custom("parse-currency")
)
)
// Regex
val emailRule = ExtractionRule(
name = "emails",
selector = Selector.RegexSelector("[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}"),
extractionType = ExtractionType.TEXT,
multiple = true
)
Post Processors
Transform extracted data:
extract {
text("price", ".price") {
process {
trim()
replace("$", "")
replace(",", "")
custom("to-number")
}
}
text("description", ".desc") {
process {
trim()
substring(0, 200)
custom("remove-html") {
// Configuration for custom processor
put("preserve-links", "true")
}
}
}
}
Crawl Policies
Control crawler behavior:
policy {
respectRobotsTxt = true
followRedirects = true
maxRedirects = 5
delayBetweenRequests = 1000 // milliseconds
maxRetries = 3
timeout = 30000
maxContentLength = 10 * 1024 * 1024 // 10MB
allowContentTypes(
"text/html",
"application/xhtml+xml",
"application/xml"
)
headers {
put("Accept", "text/html,application/xhtml+xml")
put("Accept-Language", "en-US,en;q=0.9")
put("Cache-Control", "no-cache")
}
}
🔥 Advanced Usage
Batch Crawling
val requests = (1..100).map { page ->
CrawlRequest(
id = "page-$page",
url = "https://example.com/products?page=$page",
// ... other configuration
)
}
crawler.batchCrawl(
requests = requests,
maxConcurrency = 20,
batchId = "products-crawl"
).collect { result ->
// Process results
}
Custom Extraction Engine
class MyCustomExtractor : ExtractionEngine {
override suspend fun extract(
html: String,
rules: List<ExtractionRule>
): Map<String, ExtractedValue> {
// Custom extraction logic
return extractedData
}
}
val crawler = CrawlerSDK.create(
extractionEngine = MyCustomExtractor(),
// ... other components
)
Progress Monitoring
val crawler = CrawlerSDK.create()
// Monitor statistics
launch {
while (true) {
val stats = crawler.getStats()
println("""
Active: ${stats.activeCrawls}
Completed: ${stats.completedCrawls}
Failed: ${stats.failedCrawls}
Queue Size: ${stats.queueSize}
Avg Response Time: ${stats.averageResponseTime}ms
""".trimIndent())
delay(1000)
}
}
// Start crawling
crawler.crawl(config).collect { /* ... */ }
Error Handling
crawler.crawl(config).collect { result
