diff --git a/parser/cometCalendarParser.go b/parser/cometCalendarParser.go index fd23056..4575b6c 100644 --- a/parser/cometCalendarParser.go +++ b/parser/cometCalendarParser.go @@ -1,3 +1,7 @@ +/* + This file contains the code for the comet calendar events parser. +*/ + package parser import ( @@ -8,14 +12,16 @@ import ( "regexp" "slices" "strings" + "time" + "github.com/UTDNebula/api-tools/scrapers" "github.com/UTDNebula/api-tools/utils" "github.com/UTDNebula/nebula-api/api/schema" ) // Some events have only the building name, not the abbreviation // Maps building names to their abbreviations -var buildingAbbreviations = map[string]string{ +var DefaultBuildings = map[string]string{ "Activity Center": "AB", "Activity Center Bookstore": "ACB", "Administration": "AD", @@ -74,7 +80,7 @@ var buildingAbbreviations = map[string]string{ } // Valid building abreviations for checking -var validAbbreviations []string = []string{ +var DefaultValid []string = []string{ "AB", "ACB", "AD", @@ -146,6 +152,11 @@ func ParseCometCalendar(inDir string, outDir string) { } multiBuildingMap := make(map[string]map[string]map[string][]schema.Event) + // Some events have only the building name, not the abbreviation + buildingAbbreviations, validAbbreviations, err := getLocationAbbreviations(inDir) + if err != nil { + panic(err) + } for _, event := range allEvents { @@ -239,3 +250,52 @@ func ParseCometCalendar(inDir string, outDir string) { utils.WriteJSON(fmt.Sprintf("%s/cometCalendar.json", outDir), result) } + +// getLocationAbbreviations dynamically retrieves all of the location abbreviations +func getLocationAbbreviations(inDir string) (map[string]string, []string, error) { + // Get the locations from the map scraper + var mapFile []byte + + mapFile, err := os.ReadFile(inDir + "/mapLocations.json") + if err != nil { + if os.IsNotExist(err) { + // Scrape the data if it doesn't exist yet and then get the map file + scrapers.ScrapeMapLocations(inDir) + time.Sleep(2 * time.Second) + 
ParseMapLocations(inDir, inDir) + time.Sleep(2 * time.Second) + + // If reading the locations fails again, it's not because the location data is unscraped + mapFile, err = os.ReadFile(inDir + "/mapLocations.json") + if err != nil { + return nil, nil, err + } + } else { + return nil, nil, err + } + } + + var locations []schema.MapBuilding + if err = json.Unmarshal(mapFile, &locations); err != nil { + return nil, nil, err + } + + // Process the abbreviations + buildingsAbbreviations := make(map[string]string, 0) // Maps building names to their abbreviations + validAbbreviations := make([]string, 0) // Valid building abbreviations for checking + + for _, location := range locations { + // Trim the trailing acronym from the name + trimmedName := strings.Split(*location.Name, " (")[0] + // Fall back to an empty abbreviation for locations that have no acronym + abbreviation := "" + if location.Acronym != nil { + abbreviation = *location.Acronym + } + + buildingsAbbreviations[trimmedName] = abbreviation + validAbbreviations = append(validAbbreviations, abbreviation) + } + + return buildingsAbbreviations, validAbbreviations, nil +} diff --git a/scrapers/cometCalendar.go b/scrapers/cometCalendar.go index 26e42af..3a4f613 100644 --- a/scrapers/cometCalendar.go +++ b/scrapers/cometCalendar.go @@ -1,5 +1,5 @@ /* - This file contains the code for the events scraper. + This file contains the code for the comet calendar events scraper. */ package scrapers @@ -19,9 +19,11 @@ import ( "go.mongodb.org/mongo-driver/bson/primitive" ) +const CAL_URL string = "https://calendar.utdallas.edu/api/2/events" + // RawEvent mirrors the nested event payload returned by the calendar API. type RawEvent struct { - Event map[string]interface{} `json:"event"` + Event map[string]any `json:"event"` } // APICalendarResponse models the calendar API pagination envelope. 
@@ -31,172 +33,201 @@ type APICalendarResponse struct { Date map[string]string `json:"date"` } -// ScrapeCometCalendar retrieves calendar events through the API and writes normalized JSON output. +// ScrapeCometCalendar retrieves calendar events through the API func ScrapeCometCalendar(outDir string) { err := os.MkdirAll(outDir, 0777) if err != nil { panic(err) } - cli := http.Client{Timeout: 15 * time.Second} + client := http.Client{Timeout: 15 * time.Second} var calendarData APICalendarResponse // Get the total number of pages log.Printf("Getting the number of pages...") - if err := scrapeAndUnmarshal(&cli, 0, &calendarData); err != nil { + + if err := callAndUnmarshal(&client, 0, &calendarData); err != nil { panic(err) } numPages := calendarData.Page["total"] log.Printf("The number of pages is %d!\n\n", numPages) - var events []schema.Event + var calendarEvents []schema.Event for page := range numPages { log.Printf("Scraping events of page %d...", page+1) - if err := scrapeAndUnmarshal(&cli, page+1, &calendarData); err != nil { + if err := callAndUnmarshal(&client, page+1, &calendarData); err != nil { panic(err) } - for _, rawEvent := range calendarData.Events { - // Parse the time - eventInstance := toMap(toMap(toSlice(rawEvent.Event["event_instances"])[0])["event_instance"]) - startTime := parseTime(toString(eventInstance["start"])) - endTime := startTime - if toString(eventInstance["end"]) != "" { - endTime = parseTime(toString(eventInstance["end"])) - } - - // Parse location - location := strings.Trim(fmt.Sprintf("%s, %s", toString(rawEvent.Event["location_name"]), toString(rawEvent.Event["room_number"])), " ,") - - // Parse the event types, event topic, and event target audience - filters := toMap(rawEvent.Event["filters"]) - eventTypes := []string{} - eventTopics := []string{} - targetAudiences := []string{} - - rawTypes := toSlice(filters["event_types"]) - for _, rawType := range rawTypes { - eventTypes = append(eventTypes, 
toString(toMap(rawType)["name"])) - } - - rawAudiences := toSlice(filters["event_target_audience"]) - for _, audience := range rawAudiences { - targetAudiences = append(targetAudiences, toString(toMap(audience)["name"])) - } - - rawTopics := toSlice(filters["event_topic"]) - for _, topic := range rawTopics { - eventTopics = append(eventTopics, toString(toMap(topic)["name"])) - } - - // Parse the event departments, and tags - departments := []string{} - tags := []string{} - - rawTags := toSlice(rawEvent.Event["tags"]) - for _, tag := range rawTags { - tags = append(tags, tag.(string)) - } - - rawDeparments := toSlice(rawEvent.Event["departments"]) - for _, deparment := range rawDeparments { - departments = append(departments, toMap(deparment)["name"].(string)) - } - - // Parse the contact info, =ote that some events won't have contact phone number - rawContactInfo := toMap(rawEvent.Event["custom_fields"]) - contactInfo := [3]string{} - for i, infoField := range []string{ - "contact_information_name", "contact_information_email", "contact_information_phone", - } { - contactInfo[i] = toString(rawContactInfo[infoField]) - } - - events = append(events, schema.Event{ + // Parse all necessary info + startTime, endTime := getTime(rawEvent) + eventTypes, targetAudiences, eventTopics := getFilters(rawEvent) + departments, tags := getDepartmentsAndTags(rawEvent) + contactInfo := getContactInfo(rawEvent) + + calendarEvents = append(calendarEvents, schema.Event{ Id: primitive.NewObjectID(), - Summary: toString(rawEvent.Event["title"]), - Location: location, + Summary: convert[string](rawEvent.Event["title"]), + Location: getEventLocation(rawEvent), StartTime: startTime, EndTime: endTime, - Description: toString(rawEvent.Event["description_text"]), + Description: convert[string](rawEvent.Event["description_text"]), EventType: eventTypes, TargetAudience: targetAudiences, Topic: eventTopics, EventTags: tags, - EventWebsite: toString(rawEvent.Event["url"]), + EventWebsite: 
convert[string](rawEvent.Event["url"]), Department: departments, ContactName: contactInfo[0], ContactEmail: contactInfo[1], ContactPhoneNumber: contactInfo[2], }) } + log.Printf("Scraped events of page %d successfully!\n", page+1) } - if err := utils.WriteJSON(fmt.Sprintf("%s/cometCalendarScraped.json", outDir), events); err != nil { + writePath := fmt.Sprintf("%s/cometCalendarScraped.json", outDir) + if err := utils.WriteJSON(writePath, calendarEvents); err != nil { panic(err) } - log.Printf("Finished scraping %d events successfully!\n\n", len(events)) + + log.Printf("Finished scraping %d events successfully!\n\n", len(calendarEvents)) } // scrapeAndUnmarshal fetches a calendar page and decodes it into data. -func scrapeAndUnmarshal(client *http.Client, page int, data *APICalendarResponse) error { +func callAndUnmarshal(client *http.Client, page int, data *APICalendarResponse) error { // Call API to get the byte data - calendarUrl := fmt.Sprintf("https://calendar.utdallas.edu/api/2/events?days=365&pp=100&page=%d", page) - req, err := http.NewRequest("GET", calendarUrl, nil) + calendarUrl := fmt.Sprintf("%s?days=365&pp=100&page=%d", CAL_URL, page) + request, err := http.NewRequest("GET", calendarUrl, nil) if err != nil { return err } - res, err := client.Do(req) + request.Header = http.Header{ + "Content-type": {"application/json"}, + "Accept": {"application/json"}, + } + + response, err := client.Do(request) if err != nil { return err } - if res != nil && res.StatusCode != 200 { - return fmt.Errorf("ERROR: Non-200 status is returned, %s", res.Status) + if response != nil && response.StatusCode != 200 { + return fmt.Errorf("ERROR: Non-200 status is returned, %s", response.Status) } + defer response.Body.Close() // Unmarshal bytes to the response data buffer := bytes.Buffer{} - if _, err = buffer.ReadFrom(res.Body); err != nil { + if _, err = buffer.ReadFrom(response.Body); err != nil { return err } - res.Body.Close() if err = json.Unmarshal(buffer.Bytes(), &data); 
err != nil { return err } + return nil } -// toSlice attempts to convert data into a slice of interface{}. -func toSlice(data interface{}) []interface{} { - if array, ok := data.([]interface{}); ok { - return array +// getTime parses the start and end time of the event +func getTime(event RawEvent) (time.Time, time.Time) { + instance := convert[map[string]any]( + convert[map[string]any]( + convert[[]any](event.Event["event_instances"])[0])["event_instance"]) + + // Converts RFC3339 timestamp string to time.Time + startTime, err := time.Parse(time.RFC3339, convert[string](instance["start"])) + if err != nil { + panic(err) } - return nil + + var endTime time.Time + if convert[string](instance["end"]) != "" { + endTime, err = time.Parse(time.RFC3339, convert[string](instance["end"])) + if err != nil { + panic(err) + } + } else { + endTime = startTime + } + + return startTime, endTime +} + +// getEventLocation parses the location of the event +func getEventLocation(event RawEvent) string { + building := convert[string](event.Event["location_name"]) + room := convert[string](event.Event["room_number"]) + location := strings.Trim(fmt.Sprintf("%s, %s", building, room), " ,") + + return location } -// toMap attempts to convert data into a map keyed by string. 
-func toMap(data interface{}) map[string]interface{} { - if dataMap, ok := data.(map[string]interface{}); ok { - return dataMap +// getFilters parses the types, topics, and target audiences +func getFilters(event RawEvent) ([]string, []string, []string) { + types := []string{} + audiences := []string{} + topics := []string{} + + filters := convert[map[string]any](event.Event["filters"]) + + rawTypes := convert[[]any](filters["event_types"]) + for _, rawType := range rawTypes { + types = append(types, convert[string](convert[map[string]any](rawType)["name"])) } - return nil + + rawAudiences := convert[[]any](filters["event_target_audience"]) + for _, audience := range rawAudiences { + audiences = append(audiences, convert[string](convert[map[string]any](audience)["name"])) + } + + rawTopics := convert[[]any](filters["event_topic"]) + for _, topic := range rawTopics { + topics = append(topics, convert[string](convert[map[string]any](topic)["name"])) + } + + return types, audiences, topics } -// toString returns the string form of data or empty string when nil. -func toString(data interface{}) string { - if data != nil { - if dataString, ok := data.(string); ok { - return dataString - } +// getDepartmentsAndTags parses the departments, and tags +func getDepartmentsAndTags(event RawEvent) ([]string, []string) { + departments := []string{} + tags := []string{} + + rawTags := convert[[]any](event.Event["tags"]) + for _, tag := range rawTags { + tags = append(tags, convert[string](tag)) + } + + rawDeparments := convert[[]any](event.Event["departments"]) + for _, deparment := range rawDeparments { + departments = append(departments, convert[string](convert[map[string]any](deparment)["name"])) } - return "" + + return departments, tags } -// parseTime converts an RFC3339 timestamp string to a time.Time. 
-func parseTime(stringTime string) time.Time { - parsedTime, err := time.Parse(time.RFC3339, stringTime) - if err != nil { - panic(err) +// getContactInfo parses the contact info. +func getContactInfo(event RawEvent) [3]string { + // Note that some events won't have contact phone number + contactInfo := [3]string{} + + rawContactInfo := convert[map[string]any](event.Event["custom_fields"]) + for i, infoField := range []string{ + "contact_information_name", + "contact_information_email", + "contact_information_phone", + } { + contactInfo[i] = convert[string](rawContactInfo[infoField]) + } + + return contactInfo +} + +// convert() attempts to convert data into types for this scraper +func convert[T []any | map[string]any | string](data any) T { + if newTypedData, ok := data.(T); ok { + return newTypedData } - return parsedTime + var zeroValue T + return zeroValue }