To learn more about Cloud Functions, I decided to implement a scraping script. The function loads https://www.bbc.com/, base64-encodes the page, and publishes the result to a Pub/Sub topic. A Dataflow job is subscribed to this topic and streams the messages into BigQuery. Interestingly, messages longer than 65521 characters do not arrive.
My cloud function:
def hello_pubsub(event, context):
    import re
    import json
    import base64
    import requests
    import bs4 as bs
    from google.cloud import pubsub_v1

    def publish(message):
        project_id = "adventdalen"
        topic_name = "scrape"
        publisher = pubsub_v1.PublisherClient()
        topic_path = publisher.topic_path(project_id, topic_name)
        future = publisher.publish(
            topic_path, data=message.encode('utf-8')
        )

    url = "https://www.bbc.com/"
    page = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'})
    bspage = bs.BeautifulSoup(page.text, 'html.parser')
    # base64-encode the parsed page so it can travel as a JSON string value
    decoded = base64.b64encode(bspage.encode('ascii')).decode('ascii')
    print(decoded)
    publish(json.dumps({"html":decoded[:100]})) # <- this makes it into BigQuery
    publish(json.dumps({"html":decoded[:1000]})) # <- this makes it into BigQuery
    publish(json.dumps({"html":decoded[:20000]})) # <- this makes it into BigQuery
    publish(json.dumps({"html":decoded[:40000]})) # <- this makes it into BigQuery
    publish(json.dumps({"html":decoded[:60000]})) # <- this makes it into BigQuery
    publish(json.dumps({"html":decoded[:62000]})) # <- this makes it into BigQuery
    publish(json.dumps({"html":decoded[:64000]})) # <- this makes it into BigQuery
    publish(json.dumps({"html":decoded[:64800]})) # <- this makes it into BigQuery
    publish(json.dumps({"html":decoded[:64900]})) # <- this makes it into BigQuery
    publish(json.dumps({"html":decoded[:65000]})) # <- this makes it into BigQuery
    publish(json.dumps({"html":decoded[:65100]})) # <- this makes it into BigQuery
    publish(json.dumps({"html":decoded[:65200]})) # <- this makes it into BigQuery
    publish(json.dumps({"html":decoded[:65500]})) # <- this makes it into BigQuery
    publish(json.dumps({"html":decoded[:65520]})) # <- this makes it into BigQuery
    publish(json.dumps({"html":decoded[:65521]})) # <- this makes it into BigQuery
    publish(json.dumps({"html":decoded[:65522]})) # <- this does not
    publish(json.dumps({"html":decoded[:65524]})) # <- this does not
    publish(json.dumps({"html":decoded[:65528]})) # <- this does not
    publish(json.dumps({"html":decoded[:65532]})) # <- this does not
    publish(json.dumps({"html":decoded[:65536]})) # <- this does not
    publish(json.dumps({"html":decoded[:65540]})) # <- this does not
    publish(json.dumps({"html":decoded[:65560]})) # <- this does not
    publish(json.dumps({"html":decoded[:65580]})) # <- this does not
    publish(json.dumps({"html":decoded[:65600]})) # <- this does not
    publish(json.dumps({"html":decoded[:65700]})) # <- this does not
    publish(json.dumps({"html":decoded[:65800]})) # <- this does not
    publish(json.dumps({"html":decoded[:65900]})) # <- this does not
    publish(json.dumps({"html":decoded[:66000]})) # <- this does not
    publish(json.dumps({"html":decoded[:68000]})) # <- this does not
    publish(json.dumps({"html":decoded[:70000]})) # <- this does not
    publish(json.dumps({"html":decoded[:90000]})) # <- this does not
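One detail worth noting: publish() fires and forgets the returned future, so a failed publish would not raise anywhere visible. A minimal sketch of a variant that blocks on the result (same project and topic IDs as above; the helper name is just illustrative):

from google.cloud import pubsub_v1

def publish_and_wait(message):
    # Same project and topic as in the function above.
    publisher = pubsub_v1.PublisherClient()
    topic_path = publisher.topic_path("adventdalen", "scrape")
    future = publisher.publish(topic_path, data=message.encode('utf-8'))
    # result() blocks until Pub/Sub acknowledges the message and raises
    # an exception if the publish failed, so rejected messages surface here.
    return future.result(timeout=30)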
I checked the rows in my BigQuery table, and the longest row is 65521 characters.
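The length check can be reproduced with the BigQuery client along these lines, assuming the Dataflow job maps the JSON field html to a STRING column of the same name (the dataset and table names below are placeholders for the table the job writes to):

from google.cloud import bigquery

client = bigquery.Client(project="adventdalen")
# `scrape_dataset.scrape_table` is a placeholder; LENGTH() counts the
# characters of the html STRING column.
query = """
    SELECT MAX(LENGTH(html)) AS longest_html
    FROM `adventdalen.scrape_dataset.scrape_table`
"""
for row in client.query(query).result():
    print(row.longest_html)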
What am I doing wrong? I haven't found any quota or limit that would explain this observation. (I also checked decoded; that string is around 280k characters long, so it is definitely long enough to produce rows in BigQuery longer than 65521.)
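To narrow down whether the drop happens in Pub/Sub or in the Dataflow step, one option would be to attach a second, throwaway subscription to the same topic and pull from it directly. A sketch (the subscription name scrape-debug is made up):

from google.cloud import pubsub_v1

subscriber = pubsub_v1.SubscriberClient()
# "scrape-debug" is a hypothetical extra subscription on the "scrape" topic.
subscription_path = subscriber.subscription_path("adventdalen", "scrape-debug")

response = subscriber.pull(
    request={"subscription": subscription_path, "max_messages": 10}
)
for received in response.received_messages:
    # Length of the payload exactly as Pub/Sub delivered it.
    print(len(received.message.data))
if response.received_messages:
    subscriber.acknowledge(
        request={
            "subscription": subscription_path,
            "ack_ids": [r.ack_id for r in response.received_messages],
        }
    )

If the full-length messages show up there, the 65521 cutoff would be somewhere downstream of Pub/Sub rather than in the function's publish step.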