# Content Extraction Workflow

Build a complete content extraction workflow with Dumpling AI.
This example demonstrates how to build a complete content extraction workflow using Dumpling AI. We’ll show how to:
- Scrape web content
- Extract the most important information
- Store the information in a knowledge base
- Create a search interface for the extracted content
## Setup

First, let's set up our environment and API key:
```javascript
const axios = require('axios');

// Configuration
const API_KEY = 'YOUR_API_KEY';
const BASE_URL = 'https://app.dumplingai.com/api/v1';
const KNOWLEDGE_BASE_ID = 'YOUR_KNOWLEDGE_BASE_ID'; // Create this in the dashboard

// Helper function for API requests
async function callDumplingAPI(endpoint, data) {
  try {
    const response = await axios.post(`${BASE_URL}/${endpoint}`, data, {
      headers: {
        'Content-Type': 'application/json',
        'Authorization': `Bearer ${API_KEY}`
      }
    });
    return response.data;
  } catch (error) {
    console.error(`Error calling ${endpoint}:`, error.response ? error.response.data : error.message);
    throw error;
  }
}
```
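Network calls fail transiently, so you may want a thin retry wrapper around `callDumplingAPI`. The sketch below is our own addition, not part of any official client, and it assumes rate limiting surfaces as HTTP 429; check the status codes your account actually receives.

```javascript
// A hedged sketch: retries rate-limit (assumed 429), 5xx, and network errors
// with exponential backoff. Not an official helper.
async function callDumplingAPIWithRetry(endpoint, data, retries = 3) {
  for (let attempt = 1; attempt <= retries; attempt++) {
    try {
      return await callDumplingAPI(endpoint, data);
    } catch (error) {
      const status = error.response ? error.response.status : null;
      const retryable = status === null || status === 429 || status >= 500;
      if (!retryable || attempt === retries) throw error;
      const delayMs = 1000 * 2 ** (attempt - 1); // 1s, 2s, 4s, ...
      console.warn(`Attempt ${attempt} failed (status ${status}), retrying in ${delayMs}ms`);
      await new Promise(resolve => setTimeout(resolve, delayMs));
    }
  }
}
```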
## Step 1: Scrape Web Content

Let's start by scraping content from a website:
```javascript
async function scrapeWebsite(url) {
  console.log(`Scraping website: ${url}`);

  const result = await callDumplingAPI('scrape', {
    url: url,
    format: 'markdown',
    cleaned: true,
    renderJs: true
  });

  console.log(`Successfully scraped: ${result.title}`);
  return result;
}

// Usage
const url = 'https://example.com/article';
const scrapedContent = await scrapeWebsite(url);
```
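One caveat: the usage snippets in this guide use top-level `await` for brevity, which isn't available in CommonJS scripts like this one (we're using `require`). In practice, wrap the calls in an async function:

```javascript
// Top-level await doesn't work in CommonJS, so wrap usage in an async IIFE.
(async () => {
  const url = 'https://example.com/article';
  const scrapedContent = await scrapeWebsite(url);
  console.log(scrapedContent.title);
})();
```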
## Step 2: Extract Key Information

Next, let's use an AI agent to extract the most important information from the content:
```javascript
async function extractKeyInformation(content, title) {
  console.log(`Extracting key information from: ${title}`);

  const prompt = `
    Extract the most important information from the following content.
    Format your response as JSON with the following structure:
    {
      "title": "The main title",
      "summary": "A concise summary (max 150 words)",
      "keyPoints": ["point 1", "point 2", "point 3"],
      "entities": [{"name": "Entity name", "type": "person/organization/location/etc"}],
      "categories": ["category1", "category2"]
    }

    Content:
    ${content}
  `;

  const result = await callDumplingAPI('agents/generate-completion', {
    messages: [
      {
        role: 'user',
        content: prompt
      }
    ],
    agentId: 'YOUR_AGENT_ID', // Replace with your agent ID
    parseJson: true
  });

  console.log(`Successfully extracted information with ${result.parsedJson.keyPoints.length} key points`);
  return result.parsedJson;
}

// Usage
const extractedInfo = await extractKeyInformation(scrapedContent.content, scrapedContent.title);
```
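If the agent follows the prompt, `extractedInfo` should have a shape like the sketch below (the values are illustrative, not real output). Note also that very long pages may exceed the agent's context window, in which case you'd want to truncate or chunk the content before sending it.

```javascript
// Illustrative shape only; actual values depend on the page and the agent.
const exampleExtractedInfo = {
  title: 'Example Article Title',
  summary: 'A concise summary of the article, capped at roughly 150 words...',
  keyPoints: ['First key point', 'Second key point', 'Third key point'],
  entities: [{ name: 'Example Corp', type: 'organization' }],
  categories: ['technology', 'tutorials']
};
```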
## Step 3: Store in Knowledge Base

Now, let's store the extracted information in a knowledge base:
```javascript
async function storeInKnowledgeBase(extractedInfo, sourceUrl) {
  console.log(`Storing information in knowledge base...`);

  // Format content for the knowledge base
  const formattedContent = `
# ${extractedInfo.title}

## Summary
${extractedInfo.summary}

## Key Points
${extractedInfo.keyPoints.map(point => `- ${point}`).join('\n')}

## Categories
${extractedInfo.categories.join(', ')}
`;

  const result = await callDumplingAPI('knowledge-bases/add', {
    knowledgeBaseId: KNOWLEDGE_BASE_ID,
    content: formattedContent,
    metadata: {
      title: extractedInfo.title,
      source: sourceUrl,
      extractedAt: new Date().toISOString(),
      categories: extractedInfo.categories,
      entities: extractedInfo.entities
    }
  });

  console.log(`Successfully stored in knowledge base with ID: ${result.id}`);
  return result;
}

// Usage
const storedResult = await storeInKnowledgeBase(extractedInfo, url);
```
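Re-running the workflow on the same URL will create duplicate entries, since nothing above checks for prior inserts. A minimal in-memory guard is sketched below; for real use you'd persist the seen URLs to a file or database.

```javascript
// Simple in-memory dedupe; persist this set for real deployments.
const processedSources = new Set();

async function storeOnce(extractedInfo, sourceUrl) {
  if (processedSources.has(sourceUrl)) {
    console.log(`Skipping duplicate source: ${sourceUrl}`);
    return null;
  }
  const result = await storeInKnowledgeBase(extractedInfo, sourceUrl);
  processedSources.add(sourceUrl);
  return result;
}
```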
## Step 4: Create a Search Interface

Finally, let's create a simple function to search our knowledge base:
```javascript
async function searchKnowledgeBase(query) {
  console.log(`Searching knowledge base for: ${query}`);

  const result = await callDumplingAPI('knowledge-bases/search', {
    knowledgeBaseId: KNOWLEDGE_BASE_ID,
    query: query,
    maxResults: 5
  });

  console.log(`Found ${result.results.length} results for query: ${query}`);

  // Format and display results
  result.results.forEach((item, index) => {
    console.log(`\nResult ${index + 1} (Score: ${item.score.toFixed(2)}):`);
    console.log(`Title: ${item.metadata.title}`);
    console.log(`Source: ${item.metadata.source}`);
    console.log(`Extracted: ${new Date(item.metadata.extractedAt).toLocaleString()}`);
    console.log(`Categories: ${item.metadata.categories.join(', ')}`);
    console.log(`\nContent Preview: ${item.content.substring(0, 200)}...`);
  });

  return result.results;
}

// Usage
const searchResults = await searchKnowledgeBase("analytics dashboard features");
```
## Putting It All Together

Now, let's put everything together into a complete workflow:
```javascript
async function contentExtractionWorkflow(urls) {
  const processedUrls = [];

  for (const url of urls) {
    try {
      console.log(`\n==== Processing URL: ${url} ====\n`);

      // Step 1: Scrape website
      const scrapedContent = await scrapeWebsite(url);

      // Step 2: Extract key information
      const extractedInfo = await extractKeyInformation(
        scrapedContent.content,
        scrapedContent.title
      );

      // Step 3: Store in knowledge base
      const storedResult = await storeInKnowledgeBase(extractedInfo, url);

      processedUrls.push({
        url,
        title: extractedInfo.title,
        success: true,
        knowledgeBaseItemId: storedResult.id
      });
    } catch (error) {
      console.error(`Error processing URL ${url}:`, error);
      processedUrls.push({
        url,
        success: false,
        error: error.message
      });
    }
  }

  console.log(`\n==== Processing Complete ====\n`);
  console.log(`Successfully processed ${processedUrls.filter(u => u.success).length} out of ${urls.length} URLs`);

  return processedUrls;
}

// Example usage
const urlsToProcess = [
  'https://example.com/article1',
  'https://example.com/article2',
  'https://example.com/blog/post1'
];

const results = await contentExtractionWorkflow(urlsToProcess);

// Search the processed content
const searchResults = await searchKnowledgeBase("machine learning trends");
```
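If you process many URLs in one run, consider pacing the loop so you don't hit rate limits. A simple (and admittedly arbitrary) one-second pause between iterations would look like this; check your plan's actual limits before picking a number:

```javascript
const sleep = ms => new Promise(resolve => setTimeout(resolve, ms));

// Add at the end of each iteration in contentExtractionWorkflow's for loop:
await sleep(1000); // arbitrary pacing; tune to your rate limits
```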
## Web Interface Example

Here's a simple Express.js backend with a search endpoint, reusing the searchKnowledgeBase function from Step 4:
```javascript
const express = require('express');
const cors = require('cors');

const app = express();
const port = 3000;

app.use(cors());
app.use(express.json());

// Search endpoint
app.post('/api/search', async (req, res) => {
  try {
    const { query } = req.body;

    if (!query) {
      return res.status(400).json({ error: 'Query is required' });
    }

    const results = await searchKnowledgeBase(query);

    res.json({
      query,
      results: results.map(item => ({
        title: item.metadata.title,
        source: item.metadata.source,
        extractedAt: item.metadata.extractedAt,
        categories: item.metadata.categories,
        preview: item.content.substring(0, 200) + '...',
        score: item.score
      }))
    });
  } catch (error) {
    console.error('Search error:', error);
    res.status(500).json({ error: 'Search failed', details: error.message });
  }
});

app.listen(port, () => {
  console.log(`Server running on port ${port}`);
});
```
Here's a simple HTML frontend to go with it:

```html
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Knowledge Base Search</title>
  <style>
    body {
      font-family: Arial, sans-serif;
      max-width: 800px;
      margin: 0 auto;
      padding: 20px;
    }
    .search-box {
      display: flex;
      margin-bottom: 20px;
    }
    .search-box input {
      flex-grow: 1;
      padding: 10px;
      font-size: 16px;
      border: 1px solid #ddd;
      border-radius: 4px 0 0 4px;
    }
    .search-box button {
      padding: 10px 20px;
      background: #000;
      color: white;
      border: none;
      border-radius: 0 4px 4px 0;
      cursor: pointer;
    }
    .result {
      margin-bottom: 20px;
      padding: 15px;
      border: 1px solid #ddd;
      border-radius: 4px;
    }
    .result-title {
      margin-top: 0;
      font-size: 18px;
    }
    .result-meta {
      color: #666;
      font-size: 14px;
      margin-bottom: 10px;
    }
    .result-preview {
      color: #333;
      line-height: 1.5;
    }
    .categories {
      display: flex;
      flex-wrap: wrap;
      gap: 5px;
    }
    .category {
      background: #f0f0f0;
      padding: 3px 8px;
      border-radius: 12px;
      font-size: 12px;
    }
  </style>
</head>
<body>
  <h1>Knowledge Base Search</h1>

  <div class="search-box">
    <input type="text" id="search-input" placeholder="Search the knowledge base...">
    <button id="search-button">Search</button>
  </div>

  <div id="results-container"></div>

  <script>
    document.getElementById('search-button').addEventListener('click', performSearch);
    document.getElementById('search-input').addEventListener('keypress', function(e) {
      if (e.key === 'Enter') {
        performSearch();
      }
    });

    async function performSearch() {
      const query = document.getElementById('search-input').value.trim();
      if (!query) return;

      const resultsContainer = document.getElementById('results-container');
      resultsContainer.innerHTML = '<p>Searching...</p>';

      try {
        const response = await fetch('http://localhost:3000/api/search', {
          method: 'POST',
          headers: {
            'Content-Type': 'application/json'
          },
          body: JSON.stringify({ query })
        });

        // Surface server errors instead of failing on a missing results array
        if (!response.ok) {
          throw new Error(`Search request failed with status ${response.status}`);
        }

        const data = await response.json();

        if (data.results.length === 0) {
          resultsContainer.innerHTML = '<p>No results found.</p>';
          return;
        }

        resultsContainer.innerHTML = '';

        data.results.forEach(result => {
          const resultElement = document.createElement('div');
          resultElement.className = 'result';

          const date = new Date(result.extractedAt).toLocaleDateString();

          resultElement.innerHTML = `
            <h3 class="result-title">${result.title}</h3>
            <div class="result-meta">
              <div>Source: <a href="${result.source}" target="_blank">${result.source}</a></div>
              <div>Extracted: ${date}</div>
              <div class="categories">
                ${result.categories.map(cat => `<span class="category">${cat}</span>`).join('')}
              </div>
            </div>
            <p class="result-preview">${result.preview}</p>
          `;

          resultsContainer.appendChild(resultElement);
        });
      } catch (error) {
        resultsContainer.innerHTML = `<p>Error: ${error.message}</p>`;
      }
    }
  </script>
</body>
</html>
```
## Conclusion
This example demonstrates a complete content extraction workflow using Dumpling AI:
- Scraping web content with the scrape endpoint
- Extracting key information with AI agents
- Storing and organizing the information in a knowledge base
- Creating a search interface to make the information easily accessible
You can extend this workflow by:
- Adding scheduled scraping of websites for regular updates (see the sketch after this list)
- Implementing content categorization and tagging
- Adding user authentication for personalized search experiences
- Creating visualizations of the extracted data
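For instance, scheduled scraping could be as simple as wrapping contentExtractionWorkflow in a cron job. The sketch below assumes the node-cron package and a placeholder schedule; any scheduler would work just as well.

```javascript
const cron = require('node-cron');

// Re-run the workflow every day at 06:00 (placeholder schedule and URLs).
cron.schedule('0 6 * * *', async () => {
  const urls = ['https://example.com/article1', 'https://example.com/article2'];
  await contentExtractionWorkflow(urls);
});
```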
For more examples and ideas, check out our other guides and tutorials.