Structured Data Extraction¶
Час читання: 12 хвилин
Рівень: Середній
Витягування структурованих даних (JSON-LD, Open Graph, Microdata, Twitter Cards, RDFa).
Огляд¶
StructuredDataPlugin витягує машиночитані дані з HTML сторінок:
| Формат | Опис | Пріоритет |
|---|---|---|
| JSON-LD | schema.org (рекомендований Google) | Високий |
| Open Graph | Facebook/LinkedIn метатеги (og:*) | Середній |
| Twitter Cards | Twitter метатеги (twitter:*) | Середній |
| Microdata | HTML атрибути (itemscope/itemprop) | Низький |
| RDFa | Семантичний веб (опціонально) | Низький |
Бізнес-цінність:
- E-commerce: структуровані Product/Offer замість regex
- Job aggregation: JobPosting з salary, location
- News: Article з author, datePublished
- SEO audit: автоматична валідація schema
Базове використання¶
import graph_crawler as gc
from graph_crawler.extensions.plugins.node.structured_data import StructuredDataPlugin

# Crawl a site with the plugin enabled; each node then carries a
# 'structured_data' entry in its user_data.
graph = gc.crawl(
    "https://example.com",
    plugins=[StructuredDataPlugin()]
)

for node in graph:
    sd = node.user_data.get('structured_data')
    if sd and sd.has_data:
        print(f"\n{node.url}:")
        print(f" Type: {sd.get_type()}")
        print(f" Name: {sd.get_property('name')}")
        print(f" Parse time: {sd.parse_time_ms:.1f}ms")
Налаштування¶
from graph_crawler.extensions.plugins.node.structured_data import (
    StructuredDataPlugin,
    StructuredDataOptions,
)

options = StructuredDataOptions(
    # Formats to parse
    parse_jsonld=True,        # JSON-LD (schema.org)
    parse_opengraph=True,     # Open Graph (og:*)
    parse_twitter=True,       # Twitter Cards
    parse_microdata=True,     # Microdata (itemscope)
    parse_rdfa=False,         # RDFa (disabled for speed)
    # Type filtering (None = all)
    allowed_types=['Product', 'Offer', 'Organization'],
    # Safety limits
    max_jsonld_blocks=10,     # Max JSON-LD blocks
    max_jsonld_size=100_000,  # Max JSON-LD size (bytes)
    max_microdata_items=50,   # Max microdata items
    max_nesting_depth=5,      # Max nesting depth
    # Timeouts
    timeout_per_parser=2.0,   # Seconds per parser
    # Error handling
    fail_silently=True,       # Don't crash on errors
    # Parsing options
    include_nested=True,      # Include nested objects
    normalize_types=True,     # "schema.org/Product" -> "Product"
)

graph = gc.crawl(
    "https://shop.example.com",
    plugins=[StructuredDataPlugin(options)]
)
Робота з даними¶
StructuredDataResult¶
sd = node.user_data.get('structured_data')

# Check that any structured data was found
if sd.has_data:
    # Primary type (from the highest-priority source)
    print(sd.get_type())  # "Product"
    print(sd.primary_type)  # Alias
# Get a property (searched across all sources)
print(sd.get_property('name'))
print(sd.get_property('description'))
print(sd.get_property('price'))

# All objects of a given type
products = sd.get_all_of_type('Product')
for product in products:
    print(product.get('name'))
    print(product.get('offers', {}).get('price'))

# Statistics
print(f"JSON-LD blocks: {sd.jsonld_count}")
print(f"Microdata items: {sd.microdata_count}")
print(f"Parse time: {sd.parse_time_ms}ms")

# Errors (if any)
if sd.has_errors:
    print(f"Errors: {sd.errors}")
JSON-LD¶
# Direct access to JSON-LD
json_ld = sd.json_ld  # List[dict]
for item in json_ld:
    print(f"@type: {item.get('@type')}")
    print(f"@context: {item.get('@context')}")
    print(f"name: {item.get('name')}")
    # Nested objects
    if 'offers' in item:
        offer = item['offers']
        print(f"price: {offer.get('price')}")
        print(f"priceCurrency: {offer.get('priceCurrency')}")
Open Graph¶
# Open Graph data (dict); keys keep their full "og:" prefix
og = sd.open_graph
print(f"og:title: {og.get('og:title')}")
print(f"og:description: {og.get('og:description')}")
print(f"og:image: {og.get('og:image')}")
print(f"og:url: {og.get('og:url')}")
print(f"og:type: {og.get('og:type')}")
print(f"og:site_name: {og.get('og:site_name')}")
Twitter Cards¶
# Twitter Cards (dict); keys keep their full "twitter:" prefix
twitter = sd.twitter_cards
print(f"twitter:card: {twitter.get('twitter:card')}")
print(f"twitter:title: {twitter.get('twitter:title')}")
print(f"twitter:description: {twitter.get('twitter:description')}")
print(f"twitter:image: {twitter.get('twitter:image')}")
print(f"twitter:site: {twitter.get('twitter:site')}")
Microdata¶
# Microdata items (List[dict]); each item exposes 'type' and 'properties'
microdata = sd.microdata
for item in microdata:
    print(f"type: {item.get('type')}")
    print(f"properties: {item.get('properties')}")
RDFa¶
Приклади¶
E-commerce: Продукти¶
from graph_crawler.extensions.plugins.node.structured_data import (
    StructuredDataPlugin,
    StructuredDataOptions,
)

# Keep only product-related schema.org types.
options = StructuredDataOptions(
    allowed_types=['Product', 'Offer', 'AggregateOffer', 'AggregateRating'],
)

graph = gc.crawl(
    "https://shop.example.com",
    plugins=[StructuredDataPlugin(options)]
)

products = []
for node in graph:
    sd = node.user_data.get('structured_data')
    if sd:
        for product in sd.get_all_of_type('Product'):
            offers = product.get('offers', {})
            # Handle AggregateOffer (a list of offers): take the lowest price
            if isinstance(offers, list):
                prices = [o.get('price') for o in offers if o.get('price')]
                price = min(prices) if prices else None
            else:
                price = offers.get('price')
            products.append({
                'url': node.url,
                'name': product.get('name'),
                'brand': product.get('brand', {}).get('name'),
                'price': price,
                'currency': offers.get('priceCurrency') if isinstance(offers, dict) else None,
                'availability': offers.get('availability') if isinstance(offers, dict) else None,
                'rating': product.get('aggregateRating', {}).get('ratingValue'),
                'review_count': product.get('aggregateRating', {}).get('reviewCount'),
            })

print(f"Found {len(products)} products")

# Export
import pandas as pd
df = pd.DataFrame(products)
df.to_csv('products.csv', index=False)
Новинні статті¶
# Keep only article-like schema.org types.
options = StructuredDataOptions(
    allowed_types=['Article', 'NewsArticle', 'BlogPosting', 'WebPage'],
)

graph = gc.crawl(
    "https://news.example.com",
    plugins=[StructuredDataPlugin(options)]
)

articles = []
for node in graph:
    sd = node.user_data.get('structured_data')
    if sd:
        for article in sd.get_all_of_type('Article'):
            # Author can be a string, an object, or a list of objects
            author = article.get('author', {})
            if isinstance(author, dict):
                author_name = author.get('name')
            elif isinstance(author, list):
                author_name = author[0].get('name') if author else None
            else:
                author_name = str(author)
            articles.append({
                'url': node.url,
                'headline': article.get('headline'),
                'author': author_name,
                'datePublished': article.get('datePublished'),
                'dateModified': article.get('dateModified'),
                'image': article.get('image'),
                'publisher': article.get('publisher', {}).get('name'),
            })

print(f"Found {len(articles)} articles")
Job Postings¶
# Restrict extraction to JobPosting objects only.
options = StructuredDataOptions(
    allowed_types=['JobPosting'],
)

graph = gc.crawl(
    "https://jobs.example.com",
    plugins=[StructuredDataPlugin(options)]
)

jobs = []
for node in graph:
    sd = node.user_data.get('structured_data')
    if sd:
        for job in sd.get_all_of_type('JobPosting'):
            salary = job.get('baseSalary', {})
            # salary['value'] may be a dict with minValue/maxValue — guarded below
            jobs.append({
                'url': node.url,
                'title': job.get('title'),
                'company': job.get('hiringOrganization', {}).get('name'),
                'location': job.get('jobLocation', {}).get('address', {}).get('addressLocality'),
                'salary_min': salary.get('value', {}).get('minValue') if isinstance(salary.get('value'), dict) else None,
                'salary_max': salary.get('value', {}).get('maxValue') if isinstance(salary.get('value'), dict) else None,
                'salary_currency': salary.get('currency'),
                'employment_type': job.get('employmentType'),
                'date_posted': job.get('datePosted'),
                'valid_through': job.get('validThrough'),
            })

print(f"Found {len(jobs)} job postings")
Social Media Preview¶
graph = gc.crawl(
    "https://example.com",
    plugins=[StructuredDataPlugin()]
)

# Build a social-media preview for each page from whichever source has data.
for node in graph:
    sd = node.user_data.get('structured_data')
    if sd:
        og = sd.open_graph
        twitter = sd.twitter_cards
        # Fallback chain: OG -> Twitter -> JSON-LD
        title = (
            og.get('og:title') or
            twitter.get('twitter:title') or
            sd.get_property('name')
        )
        description = (
            og.get('og:description') or
            twitter.get('twitter:description') or
            sd.get_property('description')
        )
        image = (
            og.get('og:image') or
            twitter.get('twitter:image') or
            sd.get_property('image')
        )
        print(f"\n{node.url}:")
        print(f" Title: {title}")
        print(f" Description: {description[:100] if description else None}...")
        print(f" Image: {image}")
SEO Audit¶
def audit_structured_data(node):
    """Audit a node's structured data for common SEO issues.

    Args:
        node: A crawled graph node; reads ``node.user_data['structured_data']``.

    Returns:
        A list of human-readable issue strings (empty when all checks pass).
    """
    sd = node.user_data.get('structured_data')
    issues = []
    # Nothing to audit at all — report and stop early.
    if not sd or not sd.has_data:
        issues.append("No structured data found")
        return issues
    # JSON-LD check
    if not sd.json_ld:
        issues.append("No JSON-LD found (recommended by Google)")
    # Open Graph check
    og = sd.open_graph
    if not og.get('og:title'):
        issues.append("Missing og:title")
    if not og.get('og:description'):
        issues.append("Missing og:description")
    if not og.get('og:image'):
        issues.append("Missing og:image")
    # Twitter Cards check
    twitter = sd.twitter_cards
    if not twitter.get('twitter:card'):
        issues.append("Missing twitter:card")
    return issues
graph = gc.crawl(
    "https://example.com",
    plugins=[StructuredDataPlugin()]
)

# Report every page that has at least one structured-data issue.
for node in graph:
    issues = audit_structured_data(node)
    if issues:
        print(f"\n{node.url}:")
        for issue in issues:
            # Fixed mojibake in the original output string ("β οΈ" -> "⚠️").
            print(f" ⚠️ {issue}")
Парсери¶
GraphCrawler включає окремі парсери для кожного формату:
from graph_crawler.extensions.plugins.node.structured_data import (
    JsonLdParser,
    OpenGraphParser,
    TwitterCardsParser,
    MicrodataParser,
    RdfaParser,
)

# Standalone usage (without a crawl)
parser = JsonLdParser()
json_ld_data = parser.parse(html_content)

og_parser = OpenGraphParser()
og_data = og_parser.parse(html_content)
SchemaType Enum¶
from graph_crawler.extensions.plugins.node.structured_data import SchemaType

# Common schema.org types
SchemaType.PRODUCT
SchemaType.OFFER
SchemaType.ARTICLE
SchemaType.ORGANIZATION
SchemaType.PERSON
SchemaType.EVENT
SchemaType.JOB_POSTING
# ... and more
Обробка помилок¶
options = StructuredDataOptions(
    fail_silently=True,  # Default
)

graph = gc.crawl(url, plugins=[StructuredDataPlugin(options)])

for node in graph:
    sd = node.user_data.get('structured_data')
    # Check for parser errors
    if sd.has_errors:
        print(f"Errors on {node.url}:")
        for error in sd.errors:
            print(f" - {error}")
    # Check whether the result is empty because of an error
    if sd.is_error_result:
        print(f"Failed to parse: {sd.error_message}")
Performance Tips¶
- Вимкнути непотрібні парсери:
options = StructuredDataOptions(
    parse_rdfa=False,       # RDFa is slow
    parse_microdata=False,  # If not needed
)
- Обмежте типи:
- Встановіть ліміти: