Helpex - Trao đổi & giúp đỡ Đăng nhập
4

I’m trying to make a scraper that returns data for daily flights between airports in Europe for a list of European airlines. For KLM, the data can be found on the following website by clicking on the dots on the map (data shows up in a table under the map): https://www.flightradar24.com/data/airlines/kl-klm/routes

I currently have the following code:

import requests
import json
import datetime
import pandas as pd

# Proxy endpoints handed to `requests`, keyed by the scheme of the target URL.
myProxy = dict(
    http="http://10.120.118.49:8080",
    https="https://10.120.118.49:8080",
)

# Request headers carrying a desktop Firefox User-Agent string.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:52.0) Gecko/20100101 Firefox/52.0",
}

# Whitelist of airport IATA codes.  The script keeps a route airport only if
# its code appears here, and applies the same test to each arrival airport
# found in the per-airport responses.
eu_airports = ['AAL', 'AAR', 'ABZ', 'ACE', 'ADA', 'ADB', 'AER', 'AES', 'AGP', 'AHO', 'AJA', 'ALA', 'ALC', 'AMS',
           'ANR', 'AOI', 'ARN', 'ATH', 'AYT', 'BCN', 'BDS', 'BEG', 'BER', 'BES', 'BFS', 'BGO', 'BGY', 'BHD', 
           'BHX', 'BIA', 'BIO', 'BIQ', 'BJV', 'BLL', 'BLQ', 'BMA', 'BOD', 'BOJ', 'BOO', 'BRE', 'BRI', 'BRN', 
           'BRQ', 'BRS', 'BRU', 'BTS', 'BUD', 'BVA', 'CAG', 'CDG', 'CFU', 'CGN', 'CHQ', 'CIA', 'CIY', 'CLJ', 
           'CPH', 'CRL', 'CTA', 'CWL', 'DBV', 'DEB', 'DLM', 'DME', 'DRS', 'DTM', 'DUB', 'DUS', 'EDI', 'EGC', 
           'EIN', 'EMA', 'ESB', 'EVN', 'FAO', 'FCO', 'FDH', 'FKB', 'FLR', 'FMM', 'FMO', 'FNC', 'FRA', 'FSC', 
           'FUE', 'GDN', 'GLA', 'GOA', 'GOT', 'GRO', 'GRQ', 'GRZ', 'GVA', 'GYD', 'HAJ', 'HAM', 'HAU', 'HEL', 
           'HER', 'HHN', 'HUY', 'IAS', 'IBZ', 'IEV', 'INI', 'INN', 'IST', 'JER', 'JMK', 'JTR', 'KBP', 'KEF', 
           'KGS', 'KIR', 'KIV', 'KLU', 'KRK', 'KRS', 'KTW', 'KUN', 'LBA', 'LCA', 'LCY', 'LED', 'LEI', 'LEJ', 
           'LGG', 'LGW', 'LHR', 'LIL', 'LIN', 'LIS', 'LJU', 'LNZ', 'LPA', 'LPL', 'LTN', 'LUG', 'LUX', 'LYS', 
           'MAD', 'MAH', 'MAN', 'MJV', 'MLA', 'MMX', 'MPL', 'MRS', 'MSQ', 'MST', 'MUC', 'MXP', 'NAP', 'NCE', 
           'NCL', 'NOC', 'NRN', 'NTE', 'NUE', 'NYO', 'ODS', 'OLB', 'OPO', 'ORK', 'ORY', 'OSL', 'OST', 'OTP', 
           'OUL', 'PAD', 'PDL', 'PEG', 'PFO', 'PIK', 'PMI', 'PMO', 'POZ', 'PRG', 'PRN', 'PSA', 'PSR', 'PUY', 
           'REU', 'RHO', 'RIX', 'RTM', 'RVN', 'SAW', 'SCQ', 'SDR', 'SEN', 'SJJ', 'SKG', 'SKP', 'SNN', 'SOF', 
           'SOU', 'SPU', 'STN', 'STR', 'SUF', 'SVG', 'SVO', 'SVQ', 'SVX', 'SXB', 'SXF', 'SZG', 'TBS', 'TFN', 
           'TFS', 'TGD', 'TIA', 'TIV', 'TKU', 'TLL', 'TLN', 'TLS', 'TMP', 'TOS', 'TPS', 'TRD', 'TRF', 'TRN', 
           'TSE', 'TSF', 'TSR', 'TXL', 'TZL', 'TZX', 'VAA', 'VAR', 'VCE', 'VIE', 'VKO', 'VLC', 'VNO', 'VRN', 
           'VST', 'WAW', 'WMI', 'WRO', 'XRY', 'ZAD', 'ZAG', 'ZAZ', 'ZRH', 'ZTH']

# Country names accepted by the scraper.  They are compared verbatim against
# the country keys of each response's 'arrivals' mapping, so spelling and
# capitalisation must match what the site returns.
eu_countries = ['Albania', 'Armenia', 'Austria', 'Azerbaijan', 'Belarus', 'Belgium', 'Bosnia And Herzegovina', 
            'Bulgaria', 'Croatia', 'Cyprus', 'Czech Republic', 'Denmark', 'Estonia', 'Faroe Islands', 'Finland', 
            'France', 'Georgia', 'Germany', 'Gibraltar', 'Greece', 'Hungary', 'Iceland', 'Ireland', 'Italy', 
            'Kosovo', 'Latvia', 'Lithuania', 'Luxembourg', 'Macedonia', 'Malta', 'Moldova', 'Monaco', 'Montenegro', 
            'Netherlands', 'Norway', 'Poland', 'Portugal', 'Romania', 'Russia', 'Serbia', 'Slovakia', 'Slovenia', 
            'Spain', 'Sweden', 'Switzerland', 'Ukraine', 'United Kingdom']

"""
eu_airlines_names = ['Aegean Airlines', 'Aer Lingus', 'Aeroflot', 'Air Baltic', 'Air Europa', 'Air France', 'Alitalia', 
                 'Austrian Airlines', 'Blue Air', 'BRA', 'British Airways', 'Brussels Airlines', 'Condor', 'EasyJet', 
                 'Eurowings', 'Finnair', 'Flybe', 'Germania', 'HOP!', 'Iberia', 'Icelandair', 'Jet2', 'KLM', 'LOT', 
                 'Lufthansa', 'Norwegian', 'Ryanair', 'S7 Airlines', 'SAS', 'Swiftair', 'Swiss', 'TAP Portugal', 
                 'Thomas Cook Airlines', 'Transavia', 'Travel Service', 'TUI fly', 'Ukraine Int. Airlines', 'Ural Airlines', 
                 'Virgin Atlantic', 'Volotea', 'Vueling', 'Wideroe', 'Wizz Air']

eu_airlines_iata = ['a3-aee', 'ei-ein', 'su-afl', 'bt-bti', 'ux-aea', 'af-afr', 'az-aza', 'os-aua', '0b-bms', 'tf-brx', 
                'ba-baw', 'sn-bel', 'de-cfg', 'u2-ezy', 'ew-ewg', 'ay-fin', 'be-bee', 'st-gmi', 'a5-hop', 'ib-ibe', 
                'fi-ice', 'ls-exs', 'kl-klm', 'lo-lot', 'lh-dlh', 'dy-nax', 'fr-ryr', 's7-sbi', 'sk-sas', 'wt-swt', 
                'lx-swr', 'tp-tap', 'mt-tcx', 'hv-tra', 'qs-tvs', 'x3-tui', 'ps-aui', 'u6-svr', 'vs-vir', 'v7-voe', 
                'vy-vlg', 'wf-wif', 'w6-wzz']
"""
# Airlines that are actually scraped.  The two lists are parallel (the scraping
# loop zips them together): the display name and the matching URL slug that is
# inserted into the flightradar24 routes URL.  The full roster is kept in the
# triple-quoted string directly above, which is never evaluated as code.
eu_airlines_names = ['KLM']
eu_airlines_iata = ['kl-klm']

# Collect the IATA code of every airport that appears in the route table of
# any configured airline.
#
# The list is created up front and *extended* per airline so that
#   * an empty `eu_airlines_iata` yields an empty result instead of a NameError,
#   * routes of earlier airlines are not thrown away when a later one is fetched.
iata_list = []

for airline in eu_airlines_iata:
    # `s` and `r` intentionally stay module-level names: the per-airport
    # scraping loop later in the script reuses the session and the cookies of
    # the last response.
    s = requests.session()
    r = s.get('https://www.flightradar24.com/data/airlines/' + airline + '/routes', proxies=myProxy, headers=headers)

    # The route table is embedded in the page source between the markers
    # 'arrRoutes=' and ', arrDates='; the text in between is parsed as JSON.
    my_json = json.loads(r.text.split('arrRoutes=')[-1].split(', arrDates=')[0])
    iata_list.extend(element[item]['iata'] for element in my_json for item in element)

# De-duplicate, keep only whitelisted airports, and sort so the scraping order
# is reproducible from run to run (plain set iteration order is not).
iata_list1 = set(iata_list)
iata_list2 = sorted(iata_list1.intersection(eu_airports))

print(len(iata_list2))

# Take ONE timestamp and derive the following six days from it.  Calling
# datetime.today() separately for every offset (as before) makes the offsets
# drift by a few microseconds and can produce a skipped or repeated calendar
# day when the script happens to run across midnight.
today = datetime.datetime.today()
(tomorrow1, tomorrow2, tomorrow3,
 tomorrow4, tomorrow5, tomorrow6) = (
    today + datetime.timedelta(days=offset) for offset in range(1, 7)
)

# YYYY-MM-DD strings for today (`date`) and the next six days (`date1`..`date6`);
# the scraping loop uses them as keys into each flight's "utc" mapping.
(date, date1, date2, date3, date4, date5, date6) = (
    day.strftime("%Y-%m-%d")
    for day in (today, tomorrow1, tomorrow2, tomorrow3, tomorrow4, tomorrow5, tomorrow6)
)

# Result accumulators filled by the scraping loop.  Each name is bound to its
# own fresh, empty list (the generator creates a new list per target).
(countries, airports_departure, airports_arrival, dailyflights,
 distances, flights, aircrafts, airlines) = ([] for _ in range(8))

# Headers for the per-airport JSON endpoint.  Built once instead of on every
# loop iteration; still bound to the module-level name `headers` as before.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:52.0) Gecko/20100101 Firefox/52.0", "Content-Type": "application/json", "x-fetch": "true"}

# Dates tried, in order, when the lookup keyed by today's date (`date`) fails.
# The previous hand-written chain of nested try/except blocks started at
# `date2` and never used `date1`; the next day is now tried first.
fallback_dates = (date1, date2, date3, date4, date5, date6)

# Errors treated as "this piece of the response is missing or oddly shaped".
# Anything else (e.g. a NameError from a variable that was never bound) still
# propagates, exactly as it did before.
lookup_errors = (IndexError, KeyError, TypeError, ValueError)

for airline, name in zip(eu_airlines_iata, eu_airlines_names):
    url = 'https://www.flightradar24.com/data/airlines/' + airline + '/routes?get-airport-arr-dep={}'
    print(url)

    for abbr in iata_list2:
        try:
            cookie = r.cookies.get_dict()
            response = s.get(url.format(abbr), cookies=cookie, headers=headers, proxies=myProxy).json()

            for country in response['arrivals']:
                if country in eu_countries:
                    countries.append(country)
                    daily = response['arrivals'][country]['number']['flights']

                # Nothing more to do for an airport that is already recorded
                # on either side of a route.
                if abbr in airports_departure or abbr in airports_arrival:
                    continue

                for iata in response['arrivals'][country]['airports']:
                    # NOTE(review): `abbr` is added to airports_departure right
                    # after the first whitelisted arrival airport, so only ONE
                    # arrival per departure airport is ever recorded.
                    if iata not in eu_airports or abbr in airports_departure:
                        continue

                    airports_arrival.append(iata)
                    dist = response['arrivals'][country]['airports'][iata]['distance']
                    distances.append(int(round(dist/1000)))

                    # Each iteration overwrites `aircr`, so the aircraft of the
                    # last listed flight wins.  A flight without an entry for
                    # today's date raises KeyError and lands in the fallback.
                    for flight in response['arrivals'][country]['airports'][iata]['flights']:
                        aircr = response['arrivals'][country]['airports'][iata]["flights"][flight]["utc"][date]["aircraft"]

                    print('Scraping data...')

                    if abbr not in airports_departure:
                        airports_departure.append(abbr)
                        aircrafts.append(aircr)
                        airlines.append(name)
                        dailyflights.append(daily)

        except lookup_errors:
            # Today's lookup failed: retry the same flight with each later date
            # until one has an aircraft entry.  `response`, `country`, `iata`
            # and `flight` are whatever the interrupted try block left behind.
            for fallback in fallback_dates:
                if abbr in airports_departure:
                    # Row already written for this departure airport.
                    break
                try:
                    aircr = response['arrivals'][country]['airports'][iata]["flights"][flight]["utc"][fallback]["aircraft"]
                except lookup_errors:
                    continue
                aircrafts.append(aircr)
                airlines.append(name)
                airports_departure.append(abbr)
                dailyflights.append(daily)
                break
            else:
                # No date in the window produced an aircraft: emit a placeholder
                # row so the four per-departure lists stay the same length.
                aircrafts.append('')
                airlines.append('')
                airports_departure.append('')
                dailyflights.append(0)


# One entry per result column:
# (label for the value dump, label for the length dump, collected values).
summary_rows = [
    ('Airline: ', 'Airline:           ', airlines),
    ('Departure: ', 'Departure:         ', airports_departure),
    ('Arrival: ', 'Arrival:           ', airports_arrival),
    ('Aircraft types: ', 'Aircrafts:         ', aircrafts),
    ('Distance (km): ', 'Distance:          ', distances),
    ('Daily flights: ', 'Daily flights:     ', dailyflights),
]

# First pass: dump the raw contents of every column.
for value_label, _unused, column in summary_rows:
    print(value_label + str(column))

# Second pass: dump the column lengths, then the total number of daily flights.
for _unused, count_label, column in summary_rows:
    print(count_label + str(len(column)))
print('Sum daily flights: ' + str(sum(dailyflights)))


# Assemble the final table; keys are inserted in the displayed column order.
table = {}
table['Airline'] = airlines
table['Departure'] = airports_departure
table['Arrivals'] = airports_arrival
table['Aircraft'] = aircrafts
table['Distance'] = distances
table['Daily flights'] = dailyflights
df = pd.DataFrame(table)
print(df)

This works fine for KLM, as it only has one airport hub (Schiphol) for all its flights. However, I encounter a problem when trying to scrape data for an airline such as Ryanair, which has multiple hubs all over Europe. In the code this would be done by changing the elements in the lists eu_airlines_names and eu_airlines_iata from ‘KLM’ and ‘kl-klm’ to ‘Ryanair’ and ‘fr-ryr’.

How can I adjust the scraper to deal with this? Also, would it be possible to loop over several elements in the list eu_airlines_iata instead of doing it one by one? Additionally, right now the code only scrapes a random aircraft type, but is there a way to instead scrape the type that is most frequently used throughout the week?

The ideal output would be separate lists containing:

  • The airline name
  • Departure airport
  • Arrival airport
  • Most frequently used aircraft type
  • Distance
  • Number of daily flights

for each airline in the full eu_airlines_iata list, which I have currently disabled in the code by wrapping it in a triple-quoted string.

4 hữu ích 0 bình luận 2.4k xem chia sẻ
loading
Không tìm thấy câu trả lời bạn tìm kiếm? Duyệt qua các câu hỏi được gắn thẻ python python-3.x web-scraping python-requests web-crawler , hoặc hỏi câu hỏi của bạn.

Có thể bạn quan tâm

loading