ΠŸΠ΅Ρ€Π΅ΠΉΡ‚ΠΈ Π΄ΠΎ змісту

Structured Data Extraction

Час читання: 12 Ρ…Π²ΠΈΠ»ΠΈΠ½
Π Ρ–Π²Π΅Π½ΡŒ: Π‘Π΅Ρ€Π΅Π΄Π½Ρ–ΠΉ

Витягування структурованих Π΄Π°Π½ΠΈΡ… (JSON-LD, Open Graph, Microdata, Twitter Cards, RDFa).


Огляд

StructuredDataPlugin витягує ΠΌΠ°ΡˆΠΈΠ½ΠΎΡ‡ΠΈΡ‚Π°Π½Ρ– Π΄Π°Π½Ρ– Π· HTML сторінок:

Π€ΠΎΡ€ΠΌΠ°Ρ‚ Опис ΠŸΡ€Ρ–ΠΎΡ€ΠΈΡ‚Π΅Ρ‚
JSON-LD schema.org (Ρ€Π΅ΠΊΠΎΠΌΠ΅Π½Π΄ΠΎΠ²Π°Π½ΠΈΠΉ Google) Високий
Open Graph Facebook/LinkedIn ΠΌΠ΅Ρ‚Π°Ρ‚Π΅Π³ΠΈ (og:*) Π‘Π΅Ρ€Π΅Π΄Π½Ρ–ΠΉ
Twitter Cards Twitter ΠΌΠ΅Ρ‚Π°Ρ‚Π΅Π³ΠΈ (twitter:*) Π‘Π΅Ρ€Π΅Π΄Π½Ρ–ΠΉ
Microdata HTML Π°Ρ‚Ρ€ΠΈΠ±ΡƒΡ‚ΠΈ (itemscope/itemprop) Низький
RDFa Π‘Π΅ΠΌΠ°Π½Ρ‚ΠΈΡ‡Π½ΠΈΠΉ Π²Π΅Π± (ΠΎΠΏΡ†Ρ–ΠΎΠ½Π°Π»ΡŒΠ½ΠΎ) Низький

БізнСс-Ρ†Ρ–Π½Π½Ρ–ΡΡ‚ΡŒ:

  • E-commerce: структуровані Product/Offer Π·Π°ΠΌΡ–ΡΡ‚ΡŒ regex
  • Job aggregation: JobPosting Π· salary, location
  • News: Article Π· author, datePublished
  • SEO audit: Π°Π²Ρ‚ΠΎΠΌΠ°Ρ‚ΠΈΡ‡Π½Π° валідація schema

Π‘Π°Π·ΠΎΠ²Π΅ використання

import graph_crawler as gc
from graph_crawler.extensions.plugins.node.structured_data import StructuredDataPlugin

# Crawl with the plugin enabled; it stores its result under the
# 'structured_data' key of each node's user_data.
graph = gc.crawl(
    "https://example.com",
    plugins=[StructuredDataPlugin()]
)

# Print a short summary for every page that had extractable data.
for node in graph:
    sd = node.user_data.get('structured_data')
    if sd and sd.has_data:
        print(f"\n{node.url}:")
        print(f"  Type: {sd.get_type()}")
        print(f"  Name: {sd.get_property('name')}")
        print(f"  Parse time: {sd.parse_time_ms:.1f}ms")

ΠΠ°Π»Π°ΡˆΡ‚ΡƒΠ²Π°Π½Π½Ρ

from graph_crawler.extensions.plugins.node.structured_data import (
    StructuredDataPlugin,
    StructuredDataOptions,
)

options = StructuredDataOptions(
    # Formats to parse
    parse_jsonld=True,           # JSON-LD (schema.org)
    parse_opengraph=True,        # Open Graph (og:*)
    parse_twitter=True,          # Twitter Cards
    parse_microdata=True,        # Microdata (itemscope)
    parse_rdfa=False,            # RDFa (disabled for speed)

    # Type filtering (None = all types)
    allowed_types=['Product', 'Offer', 'Organization'],

    # Safety limits
    max_jsonld_blocks=10,        # Max number of JSON-LD blocks
    max_jsonld_size=100_000,     # Max JSON-LD size (bytes)
    max_microdata_items=50,      # Max microdata items
    max_nesting_depth=5,         # Max nesting depth

    # Timeouts
    timeout_per_parser=2.0,      # Seconds per parser

    # Error handling
    fail_silently=True,          # Do not raise on parse errors

    # Parsing options
    include_nested=True,         # Include nested objects
    normalize_types=True,        # "schema.org/Product" -> "Product"
)

graph = gc.crawl(
    "https://shop.example.com",
    plugins=[StructuredDataPlugin(options)]
)

Π ΠΎΠ±ΠΎΡ‚Π° Π· Π΄Π°Π½ΠΈΠΌΠΈ

StructuredDataResult

sd = node.user_data.get('structured_data')

# Check whether any structured data was found
if sd.has_data:
    # Primary type (from the highest-priority source)
    print(sd.get_type())        # "Product"
    print(sd.primary_type)      # Alias

    # Get a property (looked up across all sources)
    print(sd.get_property('name'))
    print(sd.get_property('description'))
    print(sd.get_property('price'))

    # All objects of a given type
    products = sd.get_all_of_type('Product')
    for product in products:
        print(product.get('name'))
        print(product.get('offers', {}).get('price'))

# Statistics
print(f"JSON-LD blocks: {sd.jsonld_count}")
print(f"Microdata items: {sd.microdata_count}")
print(f"Parse time: {sd.parse_time_ms}ms")

# Errors (if any occurred)
if sd.has_errors:
    print(f"Errors: {sd.errors}")

JSON-LD

# ΠŸΡ€ΡΠΌΠΈΠΉ доступ Π΄ΠΎ JSON-LD
json_ld = sd.json_ld  # List[dict]

for item in json_ld:
    print(f"@type: {item.get('@type')}")
    print(f"@context: {item.get('@context')}")
    print(f"name: {item.get('name')}")

    # Π’ΠΊΠ»Π°Π΄Π΅Π½Ρ– ΠΎΠ±'Ρ”ΠΊΡ‚ΠΈ
    if 'offers' in item:
        offer = item['offers']
        print(f"price: {offer.get('price')}")
        print(f"priceCurrency: {offer.get('priceCurrency')}")

Open Graph

# Open Graph data (dict keyed by the full 'og:*' tag name)
og = sd.open_graph

print(f"og:title: {og.get('og:title')}")
print(f"og:description: {og.get('og:description')}")
print(f"og:image: {og.get('og:image')}")
print(f"og:url: {og.get('og:url')}")
print(f"og:type: {og.get('og:type')}")
print(f"og:site_name: {og.get('og:site_name')}")

Twitter Cards

# Twitter Cards metadata (dict keyed by the full 'twitter:*' tag name)
twitter = sd.twitter_cards

print(f"twitter:card: {twitter.get('twitter:card')}")
print(f"twitter:title: {twitter.get('twitter:title')}")
print(f"twitter:description: {twitter.get('twitter:description')}")
print(f"twitter:image: {twitter.get('twitter:image')}")
print(f"twitter:site: {twitter.get('twitter:site')}")

Microdata

# Microdata items (List[dict]); each item carries 'type' and 'properties'
microdata = sd.microdata

for item in microdata:
    print(f"type: {item.get('type')}")
    print(f"properties: {item.get('properties')}")

RDFa

# RDFa data (only populated when parse_rdfa=True)
rdfa = sd.rdfa  # List[dict]

ΠŸΡ€ΠΈΠΊΠ»Π°Π΄ΠΈ

E-commerce: ΠŸΡ€ΠΎΠ΄ΡƒΠΊΡ‚ΠΈ

from graph_crawler.extensions.plugins.node.structured_data import (
    StructuredDataPlugin,
    StructuredDataOptions,
)

# Restrict parsing to commerce-related schema.org types.
options = StructuredDataOptions(
    allowed_types=['Product', 'Offer', 'AggregateOffer', 'AggregateRating'],
)

graph = gc.crawl(
    "https://shop.example.com",
    plugins=[StructuredDataPlugin(options)]
)


def _as_price(value):
    """Coerce a JSON-LD price (frequently a string like "19.99") to float.

    Returns None for missing or non-numeric values.
    """
    try:
        return float(value)
    except (TypeError, ValueError):
        return None


products = []
for node in graph:
    sd = node.user_data.get('structured_data')
    if sd:
        for product in sd.get_all_of_type('Product'):
            offers = product.get('offers', {})

            # AggregateOffer / list of offers: take the lowest price.
            # Prices are compared numerically — JSON-LD often stores them
            # as strings, and min() over strings would be lexicographic
            # ("9.99" > "10.00").
            if isinstance(offers, list):
                prices = [p for p in (_as_price(o.get('price')) for o in offers)
                          if p is not None]
                price = min(prices) if prices else None
            else:
                price = _as_price(offers.get('price'))

            # 'brand' may be a plain string or a {'name': ...} object.
            brand = product.get('brand', {})
            brand_name = brand.get('name') if isinstance(brand, dict) else brand

            # 'aggregateRating' may be absent or malformed; only trust dicts.
            rating = product.get('aggregateRating', {})
            if not isinstance(rating, dict):
                rating = {}

            products.append({
                'url': node.url,
                'name': product.get('name'),
                'brand': brand_name,
                'price': price,
                'currency': offers.get('priceCurrency') if isinstance(offers, dict) else None,
                'availability': offers.get('availability') if isinstance(offers, dict) else None,
                'rating': rating.get('ratingValue'),
                'review_count': rating.get('reviewCount'),
            })

print(f"Found {len(products)} products")

# Export
import pandas as pd
df = pd.DataFrame(products)
df.to_csv('products.csv', index=False)

Новинні статті

options = StructuredDataOptions(
    allowed_types=['Article', 'NewsArticle', 'BlogPosting', 'WebPage'],
)

graph = gc.crawl(
    "https://news.example.com",
    plugins=[StructuredDataPlugin(options)]
)

articles = []
for node in graph:
    sd = node.user_data.get('structured_data')
    if sd:
        # NOTE(review): allowed_types includes NewsArticle/BlogPosting, but
        # this loop only asks for 'Article' — confirm get_all_of_type()
        # matches subtypes; otherwise those items are silently skipped.
        for article in sd.get_all_of_type('Article'):
            # Author may be a string, an object, or a list of objects
            author = article.get('author', {})
            if isinstance(author, dict):
                author_name = author.get('name')
            elif isinstance(author, list):
                author_name = author[0].get('name') if author else None
            else:
                author_name = str(author)

            articles.append({
                'url': node.url,
                'headline': article.get('headline'),
                'author': author_name,
                'datePublished': article.get('datePublished'),
                'dateModified': article.get('dateModified'),
                'image': article.get('image'),
                'publisher': article.get('publisher', {}).get('name'),
            })

print(f"Found {len(articles)} articles")

Job Postings

options = StructuredDataOptions(
    allowed_types=['JobPosting'],
)

graph = gc.crawl(
    "https://jobs.example.com",
    plugins=[StructuredDataPlugin(options)]
)

jobs = []
for node in graph:
    sd = node.user_data.get('structured_data')
    if sd:
        for job in sd.get_all_of_type('JobPosting'):
            # baseSalary is expected to be a MonetaryAmount dict
            salary = job.get('baseSalary', {})

            # NOTE(review): hiringOrganization and jobLocation are assumed to
            # be dicts here; in the wild they can also be plain strings or
            # lists, which would raise AttributeError — confirm upstream
            # normalization.
            jobs.append({
                'url': node.url,
                'title': job.get('title'),
                'company': job.get('hiringOrganization', {}).get('name'),
                'location': job.get('jobLocation', {}).get('address', {}).get('addressLocality'),
                'salary_min': salary.get('value', {}).get('minValue') if isinstance(salary.get('value'), dict) else None,
                'salary_max': salary.get('value', {}).get('maxValue') if isinstance(salary.get('value'), dict) else None,
                'salary_currency': salary.get('currency'),
                'employment_type': job.get('employmentType'),
                'date_posted': job.get('datePosted'),
                'valid_through': job.get('validThrough'),
            })

print(f"Found {len(jobs)} job postings")

Social Media Preview

graph = gc.crawl(
    "https://example.com",
    plugins=[StructuredDataPlugin()]
)

# Reconstruct the social-media link preview each page would produce.
for node in graph:
    sd = node.user_data.get('structured_data')
    if sd:
        og = sd.open_graph
        twitter = sd.twitter_cards

        # Fallback chain: OG -> Twitter -> JSON-LD
        title = (
            og.get('og:title') or
            twitter.get('twitter:title') or
            sd.get_property('name')
        )
        description = (
            og.get('og:description') or
            twitter.get('twitter:description') or
            sd.get_property('description')
        )
        image = (
            og.get('og:image') or
            twitter.get('twitter:image') or
            sd.get_property('image')
        )

        print(f"\n{node.url}:")
        print(f"  Title: {title}")
        print(f"  Description: {description[:100] if description else None}...")
        print(f"  Image: {image}")

SEO Audit

def audit_structured_data(node):
    """Audit a crawled node's structured data for common SEO gaps.

    Returns a list of human-readable issue strings; an empty list means
    the page passed every check.
    """
    sd = node.user_data.get('structured_data')

    # Nothing was extracted at all — report and stop early.
    if not sd or not sd.has_data:
        return ["No structured data found"]

    issues = []

    # Google recommends JSON-LD as the primary structured-data format.
    if not sd.json_ld:
        issues.append("No JSON-LD found (recommended by Google)")

    # Open Graph tags required for rich link previews.
    og = sd.open_graph
    for tag in ('og:title', 'og:description', 'og:image'):
        if not og.get(tag):
            issues.append(f"Missing {tag}")

    # Twitter needs at least a card type to render a preview.
    if not sd.twitter_cards.get('twitter:card'):
        issues.append("Missing twitter:card")

    return issues

graph = gc.crawl(
    "https://example.com",
    plugins=[StructuredDataPlugin()]
)

# Report the audit findings per URL.
for node in graph:
    issues = audit_structured_data(node)
    if issues:
        print(f"\n{node.url}:")
        for issue in issues:
            print(f"  ⚠️ {issue}")

ΠŸΠ°Ρ€ΡΠ΅Ρ€ΠΈ

GraphCrawler Π²ΠΊΠ»ΡŽΡ‡Π°Ρ” ΠΎΠΊΡ€Π΅ΠΌΡ– парсСри для ΠΊΠΎΠΆΠ½ΠΎΠ³ΠΎ Ρ„ΠΎΡ€ΠΌΠ°Ρ‚Ρƒ:

from graph_crawler.extensions.plugins.node.structured_data import (
    JsonLdParser,
    OpenGraphParser,
    TwitterCardsParser,
    MicrodataParser,
    RdfaParser,
)

# Standalone usage — html_content is the raw HTML string of the page
parser = JsonLdParser()
json_ld_data = parser.parse(html_content)

og_parser = OpenGraphParser()
og_data = og_parser.parse(html_content)

SchemaType Enum

from graph_crawler.extensions.plugins.node.structured_data import SchemaType

# Common schema.org types
SchemaType.PRODUCT
SchemaType.OFFER
SchemaType.ARTICLE
SchemaType.ORGANIZATION
SchemaType.PERSON
SchemaType.EVENT
SchemaType.JOB_POSTING
# ... and more

ΠžΠ±Ρ€ΠΎΠ±ΠΊΠ° ΠΏΠΎΠΌΠΈΠ»ΠΎΠΊ

options = StructuredDataOptions(
    fail_silently=True,  # Default: collect parser errors instead of raising
)

graph = gc.crawl(url, plugins=[StructuredDataPlugin(options)])

for node in graph:
    sd = node.user_data.get('structured_data')

    # The plugin may not produce a result for every node (e.g. non-HTML
    # responses) — guard against None before touching attributes, matching
    # the other examples' `if sd:` checks.
    if sd is None:
        continue

    # Errors collected by individual parsers
    if sd.has_errors:
        print(f"Errors on {node.url}:")
        for error in sd.errors:
            print(f"  - {error}")

    # Was the whole result empty because parsing failed outright?
    if sd.is_error_result:
        print(f"Failed to parse: {sd.error_message}")

Performance Tips

  1. Π’ΠΈΠΌΠΊΠ½Ρ–Ρ‚ΡŒ Π½Π΅ΠΏΠΎΡ‚Ρ€Ρ–Π±Π½Ρ– парсСри:
options = StructuredDataOptions(
    parse_rdfa=False,      # RDFa is slow
    parse_microdata=False, # If not needed
)
  2. ΠžΠ±ΠΌΠ΅ΠΆΡ‚Π΅ Ρ‚ΠΈΠΏΠΈ:
options = StructuredDataOptions(
    allowed_types=['Product'],  # Product only
)
  1. Π’ΡΡ‚Π°Π½ΠΎΠ²Ρ–Ρ‚ΡŒ Π»Ρ–ΠΌΡ–Ρ‚ΠΈ:
    # Tighter limits keep the worst-case parse time bounded.
    options = StructuredDataOptions(
        max_jsonld_blocks=5,
        max_microdata_items=20,
        timeout_per_parser=1.0,
    )
    

Наступні ΠΊΡ€ΠΎΠΊΠΈ