Building a Content Extraction Workflow

This example demonstrates how to build a complete content extraction workflow using Dumpling AI. We’ll show how to:

  1. Scrape web content
  2. Extract the most important information
  3. Store the information in a knowledge base
  4. Create a search interface for the extracted content

Setup

First, let’s set up our environment and API key:

const axios = require('axios');

// Configuration
const API_KEY = 'YOUR_API_KEY';
const BASE_URL = 'https://app.dumplingai.com/api/v1';
const KNOWLEDGE_BASE_ID = 'YOUR_KNOWLEDGE_BASE_ID'; // Create this in the dashboard

/**
 * POSTs `data` to the given Dumpling AI endpoint and returns the parsed
 * response body. On failure the error is logged (preferring the API's
 * response body when one exists) and rethrown for the caller to handle.
 *
 * @param {string} endpoint - Path relative to BASE_URL, e.g. 'scrape'.
 * @param {Object} data - JSON-serializable request payload.
 * @returns {Promise<Object>} The response body from the API.
 */
async function callDumplingAPI(endpoint, data) {
  const requestConfig = {
    headers: {
      'Content-Type': 'application/json',
      'Authorization': `Bearer ${API_KEY}`
    }
  };

  try {
    const { data: body } = await axios.post(`${BASE_URL}/${endpoint}`, data, requestConfig);
    return body;
  } catch (error) {
    const details = error.response ? error.response.data : error.message;
    console.error(`Error calling ${endpoint}:`, details);
    throw error;
  }
}

Step 1: Scrape Web Content

First, let’s scrape content from a website:

/**
 * Scrapes a single URL through the Dumpling AI `scrape` endpoint, asking
 * for cleaned markdown with JavaScript rendering enabled.
 *
 * @param {string} url - The page to scrape.
 * @returns {Promise<Object>} The API result (includes `title` and `content`).
 */
async function scrapeWebsite(url) {
  console.log(`Scraping website: ${url}`);

  const scrapeRequest = {
    url,
    format: 'markdown',
    cleaned: true,
    renderJs: true
  };
  const result = await callDumplingAPI('scrape', scrapeRequest);

  console.log(`Successfully scraped: ${result.title}`);
  return result;
}

// Usage
// NOTE(review): top-level `await` is only valid in an ES module; this file
// uses CommonJS `require`, so wrap calls like this in an async function
// (or run as an .mjs module) — confirm the intended module format.
const url = 'https://example.com/article';
const scrapedContent = await scrapeWebsite(url);

Step 2: Extract Key Information

Next, let’s use an AI agent to extract the most important information from the content:

/**
 * Uses a Dumpling AI agent to pull structured key information
 * (title, summary, key points, entities, categories) out of raw content.
 *
 * @param {string} content - Scraped page content (markdown or plain text).
 * @param {string} title - Page title, used only for logging.
 * @param {string} [agentId='YOUR_AGENT_ID'] - Dumpling AI agent to use.
 *   Parameterized so callers can switch agents; the default preserves the
 *   original placeholder, so existing call sites are unaffected.
 * @returns {Promise<Object>} The agent's parsed JSON response.
 */
async function extractKeyInformation(content, title, agentId = 'YOUR_AGENT_ID') {
  console.log(`Extracting key information from: ${title}`);
  
  const prompt = `
    Extract the most important information from the following content. 
    Format your response as JSON with the following structure:
    {
      "title": "The main title",
      "summary": "A concise summary (max 150 words)",
      "keyPoints": ["point 1", "point 2", "point 3"],
      "entities": [{"name": "Entity name", "type": "person/organization/location/etc"}],
      "categories": ["category1", "category2"]
    }
    
    Content:
    ${content}
  `;
  
  const result = await callDumplingAPI('agents/generate-completion', {
    messages: [
      {
        role: 'user',
        content: prompt
      }
    ],
    agentId,
    parseJson: true
  });
  
  // Guard against a malformed agent response: the original logged
  // `result.parsedJson.keyPoints.length` directly, which throws if the
  // agent did not return the expected structure.
  const keyPointCount = result.parsedJson?.keyPoints?.length ?? 0;
  console.log(`Successfully extracted information with ${keyPointCount} key points`);
  return result.parsedJson;
}

// Usage
// NOTE(review): top-level `await` requires an ES module context — see note above on module format.
const extractedInfo = await extractKeyInformation(scrapedContent.content, scrapedContent.title);

Step 3: Store in Knowledge Base

Now, let’s store the extracted information in a knowledge base:

/**
 * Stores extracted information in the configured knowledge base, formatted
 * as a small markdown document with source metadata attached.
 *
 * @param {Object} extractedInfo - Output of extractKeyInformation
 *   (title, summary, keyPoints, entities, categories).
 * @param {string} sourceUrl - Original URL the content came from.
 * @returns {Promise<Object>} The API result (includes the new item's `id`).
 */
async function storeInKnowledgeBase(extractedInfo, sourceUrl) {
  console.log(`Storing information in knowledge base...`);
  
  // Build the markdown without leading indentation. The previous indented
  // template literal prefixed every line with four-plus spaces, which
  // markdown treats as a code block — so the `#`/`##` headings never
  // rendered as headings.
  const formattedContent = [
    `# ${extractedInfo.title}`,
    '',
    '## Summary',
    extractedInfo.summary,
    '',
    '## Key Points',
    extractedInfo.keyPoints.map(point => `- ${point}`).join('\n'),
    '',
    '## Categories',
    extractedInfo.categories.join(', ')
  ].join('\n');
  
  const result = await callDumplingAPI('knowledge-bases/add', {
    knowledgeBaseId: KNOWLEDGE_BASE_ID,
    content: formattedContent,
    metadata: {
      title: extractedInfo.title,
      source: sourceUrl,
      extractedAt: new Date().toISOString(),
      categories: extractedInfo.categories,
      entities: extractedInfo.entities
    }
  });
  
  console.log(`Successfully stored in knowledge base with ID: ${result.id}`);
  return result;
}

// Usage
// NOTE(review): top-level `await` requires an ES module context — see note above on module format.
const storedResult = await storeInKnowledgeBase(extractedInfo, url);

Step 4: Create a Search Interface

Finally, let’s create a simple function to search our knowledge base:

/**
 * Searches the configured knowledge base and prints a short human-readable
 * summary of each match (score, metadata, 200-char content preview).
 *
 * @param {string} query - Free-text search query.
 * @returns {Promise<Array>} The raw result items from the API.
 */
async function searchKnowledgeBase(query) {
  console.log(`Searching knowledge base for: ${query}`);
  
  const response = await callDumplingAPI('knowledge-bases/search', {
    knowledgeBaseId: KNOWLEDGE_BASE_ID,
    query,
    maxResults: 5
  });
  
  const matches = response.results;
  console.log(`Found ${matches.length} results for query: ${query}`);
  
  // Format and display results
  for (const [index, item] of matches.entries()) {
    const extractedAt = new Date(item.metadata.extractedAt).toLocaleString();
    console.log(`\nResult ${index + 1} (Score: ${item.score.toFixed(2)}):`);
    console.log(`Title: ${item.metadata.title}`);
    console.log(`Source: ${item.metadata.source}`);
    console.log(`Extracted: ${extractedAt}`);
    console.log(`Categories: ${item.metadata.categories.join(', ')}`);
    console.log(`\nContent Preview: ${item.content.substring(0, 200)}...`);
  }
  
  return matches;
}

// Usage
// NOTE(review): top-level `await` requires an ES module context — see note above on module format.
const searchResults = await searchKnowledgeBase("analytics dashboard features");

Putting It All Together

Now, let’s put everything together into a complete workflow:

/**
 * Runs the full scrape → extract → store pipeline for each URL, one at a
 * time, and records a per-URL outcome so one failure never aborts the batch.
 *
 * @param {Array<string>} urls - URLs to process.
 * @returns {Promise<Array<Object>>} One entry per URL:
 *   { url, title, success, knowledgeBaseItemId } on success,
 *   { url, success: false, error } on failure.
 */
async function contentExtractionWorkflow(urls) {
  const outcomes = [];
  
  for (const pageUrl of urls) {
    try {
      console.log(`\n==== Processing URL: ${pageUrl} ====\n`);
      
      // Step 1: Scrape website
      const page = await scrapeWebsite(pageUrl);
      
      // Step 2: Extract key information
      const info = await extractKeyInformation(page.content, page.title);
      
      // Step 3: Store in knowledge base
      const stored = await storeInKnowledgeBase(info, pageUrl);
      
      outcomes.push({
        url: pageUrl,
        title: info.title,
        success: true,
        knowledgeBaseItemId: stored.id
      });
    } catch (error) {
      console.error(`Error processing URL ${pageUrl}:`, error);
      outcomes.push({
        url: pageUrl,
        success: false,
        error: error.message
      });
    }
  }
  
  const successCount = outcomes.filter(entry => entry.success).length;
  console.log(`\n==== Processing Complete ====\n`);
  console.log(`Successfully processed ${successCount} out of ${urls.length} URLs`);
  
  return outcomes;
}

// Example usage (top-level `await` requires an ES module context)
const urlsToProcess = [
  'https://example.com/article1',
  'https://example.com/article2',
  'https://example.com/blog/post1'
];

const workflowResults = await contentExtractionWorkflow(urlsToProcess);

// Search the processed content. Renamed from `searchResults`: the earlier
// search example already declares `const searchResults`, so reusing that
// name is a SyntaxError if these snippets run in one script.
const trendResults = await searchKnowledgeBase("machine learning trends");

Web Interface Example

Here’s a simple Express.js backend with a search endpoint:

const express = require('express');
const cors = require('cors');

const app = express();
const port = 3000;

// Allow cross-origin requests (the static frontend runs on another origin)
// and parse JSON request bodies.
app.use(cors());
app.use(express.json());

/**
 * POST /api/search — searches the knowledge base for `req.body.query`.
 * Responds 400 when the query is missing and 500 on any search failure.
 */
app.post('/api/search', async (req, res) => {
  try {
    const query = req.body.query;
    
    if (!query) {
      return res.status(400).json({ error: 'Query is required' });
    }
    
    const results = await searchKnowledgeBase(query);
    
    // Trim each result down to what the frontend renders.
    const formatted = results.map(item => ({
      title: item.metadata.title,
      source: item.metadata.source,
      extractedAt: item.metadata.extractedAt,
      categories: item.metadata.categories,
      preview: item.content.substring(0, 200) + '...',
      score: item.score
    }));
    
    res.json({ query, results: formatted });
  } catch (error) {
    console.error('Search error:', error);
    res.status(500).json({ error: 'Search failed', details: error.message });
  }
});

app.listen(port, () => {
  console.log(`Server running on port ${port}`);
});

And a simple HTML frontend:

<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Knowledge Base Search</title>
  <style>
    body {
      font-family: Arial, sans-serif;
      max-width: 800px;
      margin: 0 auto;
      padding: 20px;
    }
    .search-box {
      display: flex;
      margin-bottom: 20px;
    }
    .search-box input {
      flex-grow: 1;
      padding: 10px;
      font-size: 16px;
      border: 1px solid #ddd;
      border-radius: 4px 0 0 4px;
    }
    .search-box button {
      padding: 10px 20px;
      background: #000;
      color: white;
      border: none;
      border-radius: 0 4px 4px 0;
      cursor: pointer;
    }
    .result {
      margin-bottom: 20px;
      padding: 15px;
      border: 1px solid #ddd;
      border-radius: 4px;
    }
    .result-title {
      margin-top: 0;
      font-size: 18px;
    }
    .result-meta {
      color: #666;
      font-size: 14px;
      margin-bottom: 10px;
    }
    .result-preview {
      color: #333;
      line-height: 1.5;
    }
    .categories {
      display: flex;
      flex-wrap: wrap;
      gap: 5px;
    }
    .category {
      background: #f0f0f0;
      padding: 3px 8px;
      border-radius: 12px;
      font-size: 12px;
    }
  </style>
</head>
<body>
  <h1>Knowledge Base Search</h1>
  
  <div class="search-box">
    <input type="text" id="search-input" placeholder="Search the knowledge base...">
    <button id="search-button">Search</button>
  </div>
  
  <div id="results-container"></div>
  
  <script>
    document.getElementById('search-button').addEventListener('click', performSearch);
    document.getElementById('search-input').addEventListener('keypress', function(e) {
      if (e.key === 'Enter') {
        performSearch();
      }
    });
    
    // Escape untrusted text before it is interpolated into innerHTML.
    // Titles, sources, categories and previews come from scraped web pages,
    // so rendering them unescaped would allow stored XSS.
    function escapeHtml(value) {
      return String(value)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');
    }
    
    // Sends the query to the backend and renders the results.
    async function performSearch() {
      const query = document.getElementById('search-input').value.trim();
      
      if (!query) return;
      
      const resultsContainer = document.getElementById('results-container');
      resultsContainer.innerHTML = '<p>Searching...</p>';
      
      try {
        const response = await fetch('http://localhost:3000/api/search', {
          method: 'POST',
          headers: {
            'Content-Type': 'application/json'
          },
          body: JSON.stringify({ query })
        });
        
        // Surface server-side errors instead of crashing on a missing
        // `results` field when the backend returns { error: ... }.
        if (!response.ok) {
          const errorBody = await response.json().catch(() => ({}));
          throw new Error(errorBody.error || `Request failed with status ${response.status}`);
        }
        
        const data = await response.json();
        
        if (data.results.length === 0) {
          resultsContainer.innerHTML = '<p>No results found.</p>';
          return;
        }
        
        resultsContainer.innerHTML = '';
        
        data.results.forEach(result => {
          const resultElement = document.createElement('div');
          resultElement.className = 'result';
          
          const date = new Date(result.extractedAt).toLocaleDateString();
          
          resultElement.innerHTML = `
            <h3 class="result-title">${escapeHtml(result.title)}</h3>
            <div class="result-meta">
              <div>Source: <a href="${escapeHtml(result.source)}" target="_blank" rel="noopener noreferrer">${escapeHtml(result.source)}</a></div>
              <div>Extracted: ${date}</div>
              <div class="categories">
                ${result.categories.map(cat => `<span class="category">${escapeHtml(cat)}</span>`).join('')}
              </div>
            </div>
            <p class="result-preview">${escapeHtml(result.preview)}</p>
          `;
          
          resultsContainer.appendChild(resultElement);
        });
      } catch (error) {
        resultsContainer.innerHTML = `<p>Error: ${escapeHtml(error.message)}</p>`;
      }
    }
  </script>
</body>
</html>

Conclusion

This example demonstrates a complete content extraction workflow using Dumpling AI:

  1. Scraping web content with the scrape endpoint
  2. Extracting key information with AI agents
  3. Storing and organizing the information in a knowledge base
  4. Creating a search interface to make the information easily accessible

You can extend this workflow by:

  • Adding scheduled scraping of websites for regular updates
  • Implementing content categorization and tagging
  • Adding user authentication for personalized search experiences
  • Creating visualizations of the extracted data

For more examples and ideas, check out our other guides and tutorials.