some refactoring

nanos 2023-04-03 08:36:09 +01:00
parent 56039cfdea
commit 2b707e7807
2 changed files with 92 additions and 124 deletions


@@ -33,7 +33,7 @@ jobs:
           path: artifacts
       - name: Get Directory structure
         run: ls -lR
-      - run: python find_posts.py --lock-hours=0 --access-token=${{ secrets.ACCESS_TOKEN }} --server=${{ vars.MASTODON_SERVER }} --reply-interval-in-hours=${{ vars.REPLY_INTERVAL_IN_HOURS || 0 }} --home-timeline-length=${{ vars.HOME_TIMELINE_LENGTH || 0 }} --max-followings=${{ vars.MAX_FOLLOWINGS || 0 }} --user=${{ vars.USER }} --max-followers=${{ vars.MAX_FOLLOWERS || 0 }} --http-timeout=${{ vars.HTTP_TIMEOUT || 5 }} --max-follow-requests=${{ vars.MAX_FOLLOW_REQUESTS || 0 }} --on-fail=${{ vars.ON_FAIL }} --on-start=${{ vars.ON_START }} --on-done=${{ vars.ON_DONE }} --max-bookmarks=${{ vars.MAX_BOOKMARKS || 0 }} --remember-users-for-hours=${{ vars.REMEMBER_USERS_FOR_HOURS || 168 }} --from-notifications=${{ vars.FROM_NOTIFICATIONS || 0 }} --backfill-with-context=${{ vars.BACKFILL_WITH_CONTEXT || 0 }}
+      - run: python find_posts.py --lock-hours=0 --access-token=${{ secrets.ACCESS_TOKEN }} --server=${{ vars.MASTODON_SERVER }} --reply-interval-in-hours=${{ vars.REPLY_INTERVAL_IN_HOURS || 0 }} --home-timeline-length=${{ vars.HOME_TIMELINE_LENGTH || 0 }} --max-followings=${{ vars.MAX_FOLLOWINGS || 0 }} --user=${{ vars.USER }} --max-followers=${{ vars.MAX_FOLLOWERS || 0 }} --http-timeout=${{ vars.HTTP_TIMEOUT || 5 }} --max-follow-requests=${{ vars.MAX_FOLLOW_REQUESTS || 0 }} --on-fail=${{ vars.ON_FAIL }} --on-start=${{ vars.ON_START }} --on-done=${{ vars.ON_DONE }} --max-bookmarks=${{ vars.MAX_BOOKMARKS || 0 }} --remember-users-for-hours=${{ vars.REMEMBER_USERS_FOR_HOURS || 168 }} --from-notifications=${{ vars.FROM_NOTIFICATIONS || 0 }} --backfill-with-context=${{ vars.BACKFILL_WITH_CONTEXT || 1 }} --backfill-mentioned-users=${{ vars.BACKFILL_MENTIONED_USERS || 1 }}
       - name: Upload artifacts
         uses: actions/upload-artifact@v3
         with:
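The workflow change is on the `run:` line: the two new `--backfill-*` flags are wired to repository variables via the Actions expression `${{ vars.X || 1 }}`, which falls back to `1` when the variable is unset (an unset `vars.*` entry evaluates to an empty string, which is falsy in an Actions expression). A rough Python sketch of that fallback rule, purely to illustrate the semantics; `repo_vars` is a hypothetical stand-in for the repository's configured variables:

repo_vars = {}  # BACKFILL_WITH_CONTEXT is not configured in this repository

# `${{ vars.BACKFILL_WITH_CONTEXT || 1 }}` resolves like `x or default` in Python:
value = repo_vars.get("BACKFILL_WITH_CONTEXT", "") or 1
print(f"--backfill-with-context={value}")  # prints --backfill-with-context=1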

find_posts.py

@@ -26,103 +26,13 @@ argparser.add_argument('--max-bookmarks', required = False, type=int, default=0,
 argparser.add_argument('--from-notifications', required = False, type=int, default=0, help="Backfill accounts of anyone appearing in your notifications, during the last hours")
 argparser.add_argument('--remember-users-for-hours', required=False, type=int, default=24*7, help="How long to remember users that you aren't following for, before trying to backfill them again.")
 argparser.add_argument('--http-timeout', required = False, type=int, default=5, help="The timeout for any HTTP requests to your own, or other instances.")
-argparser.add_argument('--backfill-with-context', required = False, type=bool, default=False, help="Backfill with context?")
+argparser.add_argument('--backfill-with-context', required = False, type=bool, default=True, help="If enabled, we'll fetch remote replies when backfilling profiles.")
+argparser.add_argument('--backfill-mentioned-users', required = False, type=bool, default=True, help="If enabled, we'll backfill any mentioned users when fetching remote replies to timeline posts.")
 argparser.add_argument('--lock-hours', required = False, type=int, default=24, help="The lock timeout in hours.")
 argparser.add_argument('--on-done', required = False, default=None, help="Provide a url that will be pinged when processing has completed. You can use this for 'dead man switch' monitoring of your task")
 argparser.add_argument('--on-start', required = False, default=None, help="Provide a url that will be pinged when processing is starting. You can use this for 'dead man switch' monitoring of your task")
 argparser.add_argument('--on-fail', required = False, default=None, help="Provide a url that will be pinged when processing has failed. You can use this for 'dead man switch' monitoring of your task")
-def pull_context(
-    server,
-    access_token,
-    seen_urls,
-    replied_toot_server_ids,
-    reply_interval_hours,
-    max_home_timeline_length,
-    max_followings,
-    backfill_followings_for_user,
-    known_followings,
-    max_followers,
-    max_follow_requests,
-    max_bookmarks,
-    recently_checked_users,
-    from_notifications
-):
-    parsed_urls = {}
-    all_known_users = OrderedSet(list(known_followings) + list(recently_checked_users))
-    if reply_interval_hours > 0:
-        """pull the context toots of toots user replied to, from their
-        original server, and add them to the local server."""
-        user_ids = get_active_user_ids(server, access_token, reply_interval_hours)
-        reply_toots = get_all_reply_toots(
-            server, user_ids, access_token, seen_urls, reply_interval_hours
-        )
-        known_context_urls = get_all_known_context_urls(server, reply_toots,parsed_urls)
-        seen_urls.update(known_context_urls)
-        replied_toot_ids = get_all_replied_toot_server_ids(
-            server, reply_toots, replied_toot_server_ids, parsed_urls
-        )
-        context_urls = get_all_context_urls(server, replied_toot_ids)
-        add_context_urls(server, access_token, context_urls, seen_urls)
-    if max_home_timeline_length > 0:
-        """Do the same with any toots on the key owner's home timeline """
-        timeline_toots = get_timeline(server, access_token, max_home_timeline_length)
-        known_context_urls = get_all_known_context_urls(server, timeline_toots,parsed_urls)
-        add_context_urls(server, access_token, known_context_urls, seen_urls)
-        # Backfill any post authors, and any mentioned users
-        mentioned_users = []
-        for toot in timeline_toots:
-            these_users = []
-            toot_created_at = parser.parse(toot['created_at'])
-            cutoff = datetime.now(datetime.now().astimezone().tzinfo) - timedelta(minutes=60)
-            if(len(mentioned_users) < 10 or toot_created_at > cutoff):
-                these_users.append(toot['account'])
-                if(len(toot['mentions'])):
-                    these_users += toot['mentions']
-                if(toot['reblog'] != None):
-                    these_users.append(toot['reblog']['account'])
-                    if(len(toot['reblog']['mentions'])):
-                        these_users += toot['reblog']['mentions']
-            for user in these_users:
-                if user not in mentioned_users and user['acct'] not in all_known_users:
-                    mentioned_users.append(user)
-        add_user_posts(server, access_token, filter_known_users(mentioned_users, all_known_users), recently_checked_users, all_known_users, seen_urls)
-    if max_followings > 0:
-        log(f"Getting posts from last {max_followings} followings")
-        user_id = get_user_id(server, backfill_followings_for_user, access_token)
-        followings = get_new_followings(server, user_id, max_followings, all_known_users)
-        add_user_posts(server, access_token, followings, known_followings, all_known_users, seen_urls)
-    if max_followers > 0:
-        log(f"Getting posts from last {max_followers} followers")
-        user_id = get_user_id(server, backfill_followings_for_user, access_token)
-        followers = get_new_followers(server, user_id, max_followers, all_known_users)
-        add_user_posts(server, access_token, followers, recently_checked_users, all_known_users, seen_urls)
-    if max_follow_requests > 0:
-        log(f"Getting posts from last {max_follow_requests} follow requests")
-        follow_requests = get_new_follow_requests(server, access_token, max_follow_requests, all_known_users)
-        add_user_posts(server, access_token, follow_requests, recently_checked_users, all_known_users, seen_urls)
-    if from_notifications > 0:
-        log(f"Getting notifications for last {from_notifications} hours")
-        notification_users = get_notification_users(server, access_token, all_known_users, from_notifications)
-        add_user_posts(server, access_token, notification_users, recently_checked_users, all_known_users, seen_urls)
-    if max_bookmarks > 0:
-        log(f"Pulling replies to the last {max_bookmarks} bookmarks")
-        bookmarks = get_bookmarks(server, access_token, max_bookmarks)
-        known_context_urls = get_all_known_context_urls(server, bookmarks,parsed_urls)
-        add_context_urls(server, access_token, known_context_urls, seen_urls)
 def get_notification_users(server, access_token, known_users, max_age):
     since = datetime.now(datetime.now().astimezone().tzinfo) - timedelta(hours=max_age)
     notifications = get_paginated_mastodon(f"https://{server}/api/v1/notifications", since, headers={
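A caveat on the two new flags above: argparse's `type=bool` simply calls `bool()` on the raw string, and any non-empty string is truthy, so `--backfill-with-context=0` (or `=false`) still produces `True`. A minimal sketch of the behaviour together with one common workaround; the `str_to_bool` helper is illustrative and not part of find_posts.py:

import argparse

def str_to_bool(value):
    # Interpret common string spellings instead of relying on bool(),
    # which is True for every non-empty string.
    return str(value).strip().lower() in ("1", "true", "yes", "on")

p = argparse.ArgumentParser()
p.add_argument('--naive', type=bool, default=True)        # the pattern used in this commit
p.add_argument('--parsed', type=str_to_bool, default=True)

args = p.parse_args(['--naive=0', '--parsed=0'])
print(args.naive)   # True  -- bool('0') is True, since '0' is a non-empty string
print(args.parsed)  # False -- the helper maps '0' to False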
@@ -881,61 +791,119 @@ if __name__ == "__main__":
     RECENTLY_CHECKED_USERS_FILE = "artifacts/recently_checked_users"
-    SEEN_URLS = OrderedSet([])
+    seen_urls = OrderedSet([])
     if os.path.exists(SEEN_URLS_FILE):
         with open(SEEN_URLS_FILE, "r", encoding="utf-8") as f:
-            SEEN_URLS = OrderedSet(f.read().splitlines())
+            seen_urls = OrderedSet(f.read().splitlines())
-    REPLIED_TOOT_SERVER_IDS = {}
+    replied_toot_server_ids = {}
     if os.path.exists(REPLIED_TOOT_SERVER_IDS_FILE):
         with open(REPLIED_TOOT_SERVER_IDS_FILE, "r", encoding="utf-8") as f:
-            REPLIED_TOOT_SERVER_IDS = json.load(f)
+            replied_toot_server_ids = json.load(f)
-    KNOWN_FOLLOWINGS = OrderedSet([])
+    known_followings = OrderedSet([])
     if os.path.exists(KNOWN_FOLLOWINGS_FILE):
         with open(KNOWN_FOLLOWINGS_FILE, "r", encoding="utf-8") as f:
-            KNOWN_FOLLOWINGS = OrderedSet(f.read().splitlines())
+            known_followings = OrderedSet(f.read().splitlines())
-    RECENTLY_CHECKED_USERS = OrderedSet({})
+    recently_checked_users = OrderedSet({})
     if os.path.exists(RECENTLY_CHECKED_USERS_FILE):
         with open(RECENTLY_CHECKED_USERS_FILE, "r", encoding="utf-8") as f:
-            RECENTLY_CHECKED_USERS = OrderedSet(json.load(f))
+            recently_checked_users = OrderedSet(json.load(f))
     # Remove any users whose last check is too long in the past from the list
-    for user in list(RECENTLY_CHECKED_USERS):
-        lastCheck = RECENTLY_CHECKED_USERS.get(user)
+    for user in list(recently_checked_users):
+        lastCheck = recently_checked_users.get(user)
         userAge = datetime.now(lastCheck.tzinfo) - lastCheck
         if(userAge.total_seconds() > arguments.remember_users_for_hours * 60 * 60):
-            RECENTLY_CHECKED_USERS.pop(user)
+            recently_checked_users.pop(user)
-    pull_context(
-        arguments.server,
-        arguments.access_token,
-        SEEN_URLS,
-        REPLIED_TOOT_SERVER_IDS,
-        arguments.reply_interval_in_hours,
-        arguments.home_timeline_length,
-        arguments.max_followings,
-        arguments.user,
-        KNOWN_FOLLOWINGS,
-        arguments.max_followers,
-        arguments.max_follow_requests,
-        arguments.max_bookmarks,
-        RECENTLY_CHECKED_USERS,
-        arguments.from_notifications,
-    )
+    parsed_urls = {}
+    all_known_users = OrderedSet(list(known_followings) + list(recently_checked_users))
+    if arguments.reply_interval_in_hours > 0:
+        """pull the context toots of toots user replied to, from their
+        original server, and add them to the local server."""
+        user_ids = get_active_user_ids(arguments.server, arguments.access_token, arguments.reply_interval_in_hours)
+        reply_toots = get_all_reply_toots(
+            arguments.server, user_ids, arguments.access_token, seen_urls, arguments.reply_interval_in_hours
+        )
+        known_context_urls = get_all_known_context_urls(arguments.server, reply_toots, parsed_urls)
+        seen_urls.update(known_context_urls)
+        replied_toot_ids = get_all_replied_toot_server_ids(
+            arguments.server, reply_toots, replied_toot_server_ids, parsed_urls
+        )
+        context_urls = get_all_context_urls(arguments.server, replied_toot_ids)
+        add_context_urls(arguments.server, arguments.access_token, context_urls, seen_urls)
+    if arguments.home_timeline_length > 0:
+        """Do the same with any toots on the key owner's home timeline """
+        timeline_toots = get_timeline(arguments.server, arguments.access_token, arguments.home_timeline_length)
+        known_context_urls = get_all_known_context_urls(arguments.server, timeline_toots, parsed_urls)
+        add_context_urls(arguments.server, arguments.access_token, known_context_urls, seen_urls)
+        # Backfill any post authors, and any mentioned users
+        if arguments.backfill_mentioned_users:
+            mentioned_users = []
+            cut_off = datetime.now(datetime.now().astimezone().tzinfo) - timedelta(minutes=60)
+            for toot in timeline_toots:
+                these_users = []
+                toot_created_at = parser.parse(toot['created_at'])
+                if len(mentioned_users) < 10 or (toot_created_at > cut_off and len(mentioned_users) < 30):
+                    these_users.append(toot['account'])
+                    if(len(toot['mentions'])):
+                        these_users += toot['mentions']
+                    if(toot['reblog'] != None):
+                        these_users.append(toot['reblog']['account'])
+                        if(len(toot['reblog']['mentions'])):
+                            these_users += toot['reblog']['mentions']
+                for user in these_users:
+                    if user not in mentioned_users and user['acct'] not in all_known_users:
+                        mentioned_users.append(user)
+            add_user_posts(arguments.server, arguments.access_token, filter_known_users(mentioned_users, all_known_users), recently_checked_users, all_known_users, seen_urls)
+    if arguments.max_followings > 0:
+        log(f"Getting posts from last {arguments.max_followings} followings")
+        user_id = get_user_id(arguments.server, arguments.user, arguments.access_token)
+        followings = get_new_followings(arguments.server, user_id, arguments.max_followings, all_known_users)
+        add_user_posts(arguments.server, arguments.access_token, followings, known_followings, all_known_users, seen_urls)
+    if arguments.max_followers > 0:
+        log(f"Getting posts from last {arguments.max_followers} followers")
+        user_id = get_user_id(arguments.server, arguments.user, arguments.access_token)
+        followers = get_new_followers(arguments.server, user_id, arguments.max_followers, all_known_users)
+        add_user_posts(arguments.server, arguments.access_token, followers, recently_checked_users, all_known_users, seen_urls)
+    if arguments.max_follow_requests > 0:
+        log(f"Getting posts from last {arguments.max_follow_requests} follow requests")
+        follow_requests = get_new_follow_requests(arguments.server, arguments.access_token, arguments.max_follow_requests, all_known_users)
+        add_user_posts(arguments.server, arguments.access_token, follow_requests, recently_checked_users, all_known_users, seen_urls)
+    if arguments.from_notifications > 0:
+        log(f"Getting notifications for last {arguments.from_notifications} hours")
+        notification_users = get_notification_users(arguments.server, arguments.access_token, all_known_users, arguments.from_notifications)
+        add_user_posts(arguments.server, arguments.access_token, notification_users, recently_checked_users, all_known_users, seen_urls)
+    if arguments.max_bookmarks > 0:
+        log(f"Pulling replies to the last {arguments.max_bookmarks} bookmarks")
+        bookmarks = get_bookmarks(arguments.server, arguments.access_token, arguments.max_bookmarks)
+        known_context_urls = get_all_known_context_urls(arguments.server, bookmarks, parsed_urls)
+        add_context_urls(arguments.server, arguments.access_token, known_context_urls, seen_urls)
     with open(KNOWN_FOLLOWINGS_FILE, "w", encoding="utf-8") as f:
-        f.write("\n".join(list(KNOWN_FOLLOWINGS)[-10000:]))
+        f.write("\n".join(list(known_followings)[-10000:]))
     with open(SEEN_URLS_FILE, "w", encoding="utf-8") as f:
-        f.write("\n".join(list(SEEN_URLS)[-10000:]))
+        f.write("\n".join(list(seen_urls)[-10000:]))
     with open(REPLIED_TOOT_SERVER_IDS_FILE, "w", encoding="utf-8") as f:
-        json.dump(dict(list(REPLIED_TOOT_SERVER_IDS.items())[-10000:]), f)
+        json.dump(dict(list(replied_toot_server_ids.items())[-10000:]), f)
     with open(RECENTLY_CHECKED_USERS_FILE, "w", encoding="utf-8") as f:
-        RECENTLY_CHECKED_USERS.toJSON()
+        f.write(recently_checked_users.toJSON())
     os.remove(LOCK_FILE)
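The bulk of the commit inlines the deleted pull_context() into the `__main__` block, replacing each function parameter with the matching `arguments.*` attribute. One detail that matters for this kind of mechanical rename: argparse derives attribute names from the option strings, stripping the leading dashes and turning interior dashes into underscores, so the old parameter names (`reply_interval_hours`, `max_home_timeline_length`, `backfill_followings_for_user`) are not valid attributes of the parsed namespace. A quick standalone sketch of the mapping, using flag names taken from the diff above:

import argparse

argparser = argparse.ArgumentParser()
argparser.add_argument('--reply-interval-in-hours', type=int, default=0)
argparser.add_argument('--home-timeline-length', type=int, default=0)
argparser.add_argument('--user', required=False, default='')

arguments = argparser.parse_args([])
# Dashes in option strings become underscores in attribute names:
print(arguments.reply_interval_in_hours)  # 0
print(arguments.home_timeline_length)     # 0
print(arguments.user)                     # ''
# arguments.reply_interval_hours or arguments.backfill_followings_for_user
# would raise AttributeError; those were only the parameter names of the
# old pull_context() function.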