This article walks through building a mature API-driven scraping service with Playwright and TypeScript/JavaScript. It covers the basic architecture, advanced features, production-grade design, performance optimization, security considerations, and deployment and monitoring, with ready-to-use code examples throughout.
1. Basic API Scraper Service Architecture
1.1 Express + Playwright Approach
This is a lightweight API scraper service built on Node.js and Express:
```typescript
import express from 'express';
import { chromium, Browser } from 'playwright';

const app = express();
app.use(express.json());

let browser: Browser;

// Initialize a shared headless browser instance
async function initBrowser() {
  browser = await chromium.launch({
    headless: true,
    args: ['--no-sandbox']
  });
}

// Core scraping logic
async function scrapePage(url: string, options = {}) {
  const context = await browser.newContext();
  const page = await context.newPage();

  try {
    await page.goto(url, { waitUntil: 'networkidle' });

    // Customize the extraction logic to fit your needs
    const data = await page.evaluate(() => {
      return {
        title: document.title,
        content: document.body.innerText,
        links: [...document.querySelectorAll('a')].map(a => a.href)
      };
    });

    return { success: true, data };
  } catch (error) {
    return { success: false, error: (error as Error).message };
  } finally {
    await page.close();
    await context.close();
  }
}

// API endpoint
app.post('/api/scrape', async (req, res) => {
  const { url } = req.body;
  if (!url) {
    return res.status(400).json({ error: 'URL is required' });
  }

  const result = await scrapePage(url);
  res.json(result);
});

// Start the service
initBrowser().then(() => {
  app.listen(3000, () => {
    console.log('Scraper API running on http://localhost:3000');
  });
});

// Graceful shutdown
process.on('SIGTERM', async () => {
  await browser.close();
  process.exit(0);
});
```
This approach provides the following features:
- A RESTful API built on Express
- Headless browser instance management with Playwright
- Basic error handling and resource cleanup
- Graceful startup and shutdown
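To verify the service end to end, a small client script can exercise the endpoint. This is a minimal sketch, assuming the server above is running locally on port 3000 and that a global `fetch` is available (Node.js 18+):

```typescript
// Minimal client sketch: POST a URL to the local scraper API and print the result
async function callScraper(url: string) {
  const response = await fetch('http://localhost:3000/api/scrape', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ url })
  });

  const result = await response.json();
  if (!result.success) {
    throw new Error(`Scrape failed: ${result.error}`);
  }
  return result.data;
}

callScraper('https://example.com')
  .then(data => console.log(data.title, data.links.length))
  .catch(err => console.error(err));
```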
2. Advanced Feature Implementation
2.1 Dynamic Parameters and Configuration
```typescript
interface ScrapeOptions {
  waitUntil?: 'load' | 'domcontentloaded' | 'networkidle';
  timeout?: number;
  headers?: Record<string, string>;
  screenshot?: boolean;
  pdf?: boolean;
  userAgent?: string;
}

async function scrapeWithOptions(url: string, options: ScrapeOptions = {}) {
  const context = await browser.newContext({
    userAgent: options.userAgent || 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
  });

  const page = await context.newPage();
  if (options.headers) {
    await page.setExtraHTTPHeaders(options.headers);
  }

  try {
    await page.goto(url, {
      waitUntil: options.waitUntil || 'networkidle',
      timeout: options.timeout || 30000
    });

    const result: any = {
      title: await page.title(),
      url: page.url()
    };

    // Optional outputs: full-page screenshot and PDF rendering
    if (options.screenshot) {
      result.screenshot = await page.screenshot({ fullPage: true });
    }

    // Note: page.pdf() is only supported in headless Chromium
    if (options.pdf) {
      result.pdf = await page.pdf();
    }

    return result;
  } finally {
    await page.close();
    await context.close();
  }
}
```
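As a usage sketch (assuming the `scrapeWithOptions` function above and Node's `fs/promises`), the returned buffers can be written straight to disk:

```typescript
import { writeFile } from 'fs/promises';

// Hypothetical usage: capture a screenshot and a PDF of the target page
async function captureExample() {
  const result = await scrapeWithOptions('https://example.com', {
    waitUntil: 'networkidle',
    timeout: 15000,
    screenshot: true,
    pdf: true,
    headers: { 'Accept-Language': 'en-US' }
  });

  // screenshot and pdf are Buffers returned by Playwright
  await writeFile('page.png', result.screenshot);
  await writeFile('page.pdf', result.pdf);
  console.log('Captured:', result.title, result.url);
}

captureExample().catch(console.error);
```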
2.2 Intercepting Network Requests to Improve Performance
```typescript
async function scrapeWithInterception(url: string) {
  const context = await browser.newContext();
  const page = await context.newPage();

  // Abort requests for images and fonts to speed up page loads
  await context.route('**/*.{png,jpg,jpeg,svg,gif,woff,woff2}', route => route.abort());

  // Collect responses from the site's own API calls
  const apiResponses: Array<{ url: string; status: number; body: unknown }> = [];
  page.on('response', async response => {
    if (response.url().includes('/api/')) {
      apiResponses.push({
        url: response.url(),
        status: response.status(),
        body: await response.json().catch(() => null)
      });
    }
  });

  await page.goto(url);

  return {
    pageContent: await page.content(),
    apiResponses
  };
}
```
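If only one specific API response matters, Playwright's `page.waitForResponse` can target it directly instead of collecting everything. A minimal sketch, assuming the same shared `browser` instance and a hypothetical `apiPattern` substring to match:

```typescript
// Sketch: navigate and wait for a single matching API response
async function scrapeSingleApiCall(url: string, apiPattern: string) {
  const context = await browser.newContext();
  const page = await context.newPage();

  try {
    // Register the wait before navigating so the response is not missed
    const responsePromise = page.waitForResponse(
      resp => resp.url().includes(apiPattern) && resp.ok()
    );
    await page.goto(url);
    const response = await responsePromise;
    return await response.json();
  } finally {
    await page.close();
    await context.close();
  }
}
```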
3. Production-Grade Architecture
3.1 Complete Scraper Service Layout
```
playwright-api/
├── src/
│   ├── config/                    # Configuration management
│   │   └── browser.ts             # Browser configuration
│   ├── controllers/               # API controllers
│   │   └── scrape.controller.ts
│   ├── services/                  # Business logic
│   │   ├── browser.service.ts     # Browser management
│   │   └── scrape.service.ts      # Scraping logic
│   ├── routes/                    # API routes
│   │   └── scrape.route.ts
│   ├── middlewares/               # Middleware
│   │   └── error.middleware.ts
│   └── index.ts                   # Application entry point
├── test/                          # Tests
├── package.json
├── tsconfig.json
└── .env                           # Environment variables
```
3.2 Browser Service Management
```typescript
import { chromium, Browser, BrowserContext } from 'playwright';

class BrowserService {
  private browser: Browser | null = null;
  private contexts: BrowserContext[] = [];

  async launch() {
    if (this.browser) return;

    this.browser = await chromium.launch({
      headless: true,
      args: ['--no-sandbox']
    });
  }

  async newContext() {
    if (!this.browser) await this.launch();

    const context = await this.browser!.newContext();
    this.contexts.push(context);
    return context;
  }

  // Used by the /health endpoint in section 5.3
  isRunning() {
    return this.browser !== null && this.browser.isConnected();
  }

  async close() {
    for (const context of this.contexts) {
      await context.close();
    }
    this.contexts = [];

    if (this.browser) {
      await this.browser.close();
      this.browser = null;
    }
  }
}

export const browserService = new BrowserService();
```
3.3 Scrape Service Implementation
```typescript
import { browserService } from './browser.service';
import { Page } from 'playwright';

export class ScrapeService {
  async scrape(url: string, options: any = {}) {
    const context = await browserService.newContext();
    const page = await context.newPage();

    try {
      await page.goto(url, {
        waitUntil: options.waitUntil || 'networkidle',
        timeout: options.timeout || 30000
      });

      // Custom data extraction logic
      const data = await this.extractData(page, options);
      return { success: true, data };
    } catch (error) {
      return { success: false, error: (error as Error).message };
    } finally {
      // The context stays registered with browserService and is
      // released when browserService.close() runs at shutdown
      await page.close();
    }
  }

  private async extractData(page: Page, options: any) {
    // Implement the concrete extraction logic here
    return {
      title: await page.title(),
      content: await page.content(),
      // Other custom fields
    };
  }
}

// Shared instance imported by the queue consumer (4.2) and the tests (7.1)
export const scrapeService = new ScrapeService();
```
4. Performance Optimization and Scaling
4.1 Using a Cluster for Higher Concurrency
```typescript
import { Cluster } from 'playwright-cluster';

async function runCluster() {
  const cluster = await Cluster.launch({
    concurrency: Cluster.CONCURRENCY_CONTEXT,
    maxConcurrency: 4, // tune to the number of CPU cores
    playwrightOptions: { headless: true }
  });

  // Task handler shared by all queued jobs
  await cluster.task(async ({ page, data: url }) => {
    await page.goto(url);
    return await page.evaluate(() => document.title);
  });

  // Queue jobs
  cluster.queue('https://example.com');
  cluster.queue('https://example.org');

  // Collect results
  cluster.on('taskend', (result) => {
    console.log(`Title: ${result}`);
  });

  await cluster.idle();
  await cluster.close();
}
```
4.2 Distributed Scraping with a Message Queue
```typescript
import { Consumer } from 'sqs-consumer';
import AWS from 'aws-sdk';
import { scrapeService } from './services/scrape.service';

const app = Consumer.create({
  queueUrl: process.env.SQS_QUEUE_URL,
  handleMessage: async (message) => {
    const { url, options } = JSON.parse(message.Body!);
    const result = await scrapeService.scrape(url, options);

    // Handle the result, e.g. store it in a database or publish to another queue
    console.log(result);
  },
  sqs: new AWS.SQS()
});

app.on('error', (err) => {
  console.error(err.message);
});

app.on('processing_error', (err) => {
  console.error(err.message);
});

app.start();
```
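The producer side is not shown above. A minimal sketch of enqueueing scrape jobs with the same aws-sdk v2 client, assuming the queue URL and payload shape that the consumer expects:

```typescript
import AWS from 'aws-sdk';

const sqs = new AWS.SQS();

// Enqueue a scrape job; the consumer parses { url, options } from the message body
async function enqueueScrape(url: string, options: Record<string, unknown> = {}) {
  await sqs.sendMessage({
    QueueUrl: process.env.SQS_QUEUE_URL!,
    MessageBody: JSON.stringify({ url, options })
  }).promise();
}

enqueueScrape('https://example.com', { waitUntil: 'networkidle' })
  .then(() => console.log('Job queued'))
  .catch(console.error);
```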
5. Deployment and Monitoring
5.1 Docker Deployment
```dockerfile
FROM node:16

WORKDIR /app
COPY package*.json ./
RUN npm install

COPY . .

# Install Playwright browsers and their system dependencies
RUN npx playwright install
RUN npx playwright install-deps

# Assumes the compiled output exists in dist/ (e.g. add a "npm run build" step for TypeScript)
CMD ["node", "dist/index.js"]
```
5.2 Process Management with PM2
```bash
pm2 start dist/index.js --name "playwright-api" -i max
pm2 save
pm2 startup
```
5.3 Health Checks and Monitoring
```typescript
import client from 'prom-client';

// Health-check endpoint
app.get('/health', (req, res) => {
  res.json({
    status: 'UP',
    browser: browserService.isRunning(),
    timestamp: new Date().toISOString()
  });
});

// Collect default Node.js process metrics for Prometheus
const collectDefaultMetrics = client.collectDefaultMetrics;
collectDefaultMetrics({ timeout: 5000 });

app.get('/metrics', async (req, res) => {
  res.set('Content-Type', client.register.contentType);
  res.end(await client.register.metrics());
});
```
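Beyond the default metrics, a custom metric can track scrape latency. This is a sketch using prom-client's Histogram together with the `scrapeService` from section 3.3; the metric name, labels, and buckets are arbitrary choices:

```typescript
// Hypothetical custom metric: scrape duration, labelled by outcome
const scrapeDuration = new client.Histogram({
  name: 'scrape_duration_seconds',
  help: 'Duration of scrape requests in seconds',
  labelNames: ['status'],
  buckets: [0.5, 1, 2, 5, 10, 30]
});

async function timedScrape(url: string, options = {}) {
  const end = scrapeDuration.startTimer();
  const result = await scrapeService.scrape(url, options);
  end({ status: result.success ? 'ok' : 'error' });
  return result;
}
```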
6. Security Considerations
6.1 API Authentication
```typescript
import passport from 'passport';
import { BasicStrategy } from 'passport-http';

passport.use(new BasicStrategy((username, password, done) => {
  if (username === process.env.API_USER && password === process.env.API_PASS) {
    return done(null, { user: 'api' });
  }
  return done(null, false);
}));

// Protect the scrape endpoint
app.post('/api/scrape', passport.authenticate('basic', { session: false }), scrapeController.scrape);
```
6.2 Request Rate Limiting
```typescript
import rateLimit from 'express-rate-limit';

const limiter = rateLimit({
  windowMs: 15 * 60 * 1000, // 15-minute window
  max: 100,                 // limit each IP to 100 requests per window
  message: 'Too many requests from this IP, please try again later'
});

app.use(limiter);
```
7. Testing and Debugging
7.1 Unit Test Example
```typescript
import { test, expect } from '@playwright/test';
import { scrapeService } from '../src/services/scrape.service';

test.describe('ScrapeService', () => {
  test('should return page title', async () => {
    const result = await scrapeService.scrape('https://example.com');
    expect(result.success).toBe(true);
    expect(result.data.title).toContain('Example');
  });
});
```
7.2 Debugging Tips
```typescript
// Launch in headed mode with DevTools open
const browser = await chromium.launch({
  headless: false,
  devtools: true
});

// Log the browser's console output
page.on('console', msg => {
  console.log('Browser console:', msg.text());
});

// Trace network requests and responses
page.on('request', request => console.log('>>', request.method(), request.url()));
page.on('response', response => console.log('<<', response.status(), response.url()));
```
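Playwright's built-in Inspector is another option for stepping through a scrape interactively. A small sketch, assuming a headed launch (any script can also be run with the `PWDEBUG=1` environment variable to get the same Inspector):

```typescript
import { chromium } from 'playwright';

// Open the Playwright Inspector and step through actions interactively
async function debugRun() {
  const browser = await chromium.launch({ headless: false });
  const page = await browser.newPage();
  await page.goto('https://example.com');
  await page.pause(); // execution stops here until resumed from the Inspector
  await browser.close();
}

debugRun();
```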
Summary
The material above lays out a complete path for building an API-driven scraper with Playwright and TypeScript/JavaScript, from a basic implementation to a production-grade architecture, covering:
- Basic API service: a simple integration of Express and Playwright
- Advanced features: request interception, dynamic parameters, multiple output formats
- Production architecture: modular design, error handling, resource management
- Performance and scaling: cluster support, message-queue integration
- Deployment and operations: Docker containerization, process management, monitoring
- Security: API authentication, request rate limiting
- Testing and debugging: unit tests, debugging techniques
These building blocks can be combined and adapted to build a scraper API suited to different scenarios. For workloads that need higher throughput or more complex business logic, consider layering on a distributed task queue, caching, and similar higher-level components.