Localized 'time ago' parser for YouTube with multiple phrases for each time unit.
This commit is contained in:
parent
a5548b4dc2
commit
77a74a91c9
|
@ -16,6 +16,7 @@ import org.schabi.newpipe.extractor.exceptions.ReCaptchaException;
|
|||
import org.schabi.newpipe.extractor.playlist.PlaylistExtractor;
|
||||
import org.schabi.newpipe.extractor.stream.StreamInfoItemCollector;
|
||||
import org.schabi.newpipe.extractor.stream.StreamType;
|
||||
import org.schabi.newpipe.extractor.stream.TimeAgoParser;
|
||||
import org.schabi.newpipe.extractor.utils.Parser;
|
||||
import org.schabi.newpipe.extractor.utils.Utils;
|
||||
|
||||
|
@ -26,6 +27,8 @@ import javax.annotation.Nonnull;
|
|||
@SuppressWarnings("WeakerAccess")
|
||||
public class YoutubePlaylistExtractor extends PlaylistExtractor<YoutubeService> {
|
||||
|
||||
private final TimeAgoParser timeAgoParser = getService().getTimeAgoParser();
|
||||
|
||||
private Document doc;
|
||||
/**
|
||||
* It's lazily initialized (when getNextStreams is called)
|
||||
|
@ -206,7 +209,7 @@ public class YoutubePlaylistExtractor extends PlaylistExtractor<YoutubeService>
|
|||
continue;
|
||||
}
|
||||
|
||||
collector.commit(new YoutubeStreamInfoItemExtractor(li, null) {
|
||||
collector.commit(new YoutubeStreamInfoItemExtractor(li, timeAgoParser) {
|
||||
public Element uploaderLink;
|
||||
|
||||
@Override
|
||||
|
|
|
@ -8,6 +8,7 @@ import org.schabi.newpipe.extractor.NewPipe;
|
|||
import org.schabi.newpipe.extractor.exceptions.ExtractionException;
|
||||
import org.schabi.newpipe.extractor.search.InfoItemSearchCollector;
|
||||
import org.schabi.newpipe.extractor.search.SearchEngine;
|
||||
import org.schabi.newpipe.extractor.stream.TimeAgoParser;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URLEncoder;
|
||||
|
@ -38,8 +39,11 @@ public class YoutubeSearchEngine extends SearchEngine {
|
|||
private static final String TAG = YoutubeSearchEngine.class.toString();
|
||||
public static final String CHARSET_UTF_8 = "UTF-8";
|
||||
|
||||
public YoutubeSearchEngine(int serviceId) {
|
||||
private final TimeAgoParser timeAgoParser;
|
||||
|
||||
/**
 * Creates a search engine for YouTube.
 * @param serviceId passed on to the SearchEngine constructor
 * @param timeAgoParser kept in a field and handed to the YoutubeStreamInfoItemExtractor
 *                      that is created for video results
 */
public YoutubeSearchEngine(int serviceId, TimeAgoParser timeAgoParser) {
|
||||
super(serviceId);
|
||||
this.timeAgoParser = timeAgoParser;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -107,7 +111,7 @@ public class YoutubeSearchEngine extends SearchEngine {
|
|||
|
||||
// video item type
|
||||
} else if ((el = item.select("div[class*=\"yt-lockup-video\"]").first()) != null) {
|
||||
collector.commit(new YoutubeStreamInfoItemExtractor(el, null));
|
||||
collector.commit(new YoutubeStreamInfoItemExtractor(el, timeAgoParser));
|
||||
} else if ((el = item.select("div[class*=\"yt-lockup-channel\"]").first()) != null) {
|
||||
collector.commit(new YoutubeChannelInfoItemExtractor(el));
|
||||
} else if ((el = item.select("div[class*=\"yt-lockup-playlist\"]").first()) != null &&
|
||||
|
|
|
@ -13,6 +13,8 @@ import org.schabi.newpipe.extractor.stream.StreamExtractor;
|
|||
import org.schabi.newpipe.extractor.stream.TimeAgoParser;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.EnumMap;
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
/*
|
||||
|
@ -37,13 +39,15 @@ import java.io.IOException;
|
|||
|
||||
public class YoutubeService extends StreamingService {
|
||||
|
||||
private Map<TimeAgoParser.TimeAgoUnit, String[]> timeAgoParserPhrases;
|
||||
|
||||
/**
 * Creates the YouTube service.
 * @param id   forwarded unchanged to the StreamingService constructor
 * @param name forwarded unchanged to the StreamingService constructor
 */
public YoutubeService(int id, String name) {
|
||||
super(id, name);
|
||||
}
|
||||
|
||||
/**
 * Creates a new YoutubeSearchEngine for this service's id.
 * NOTE(review): this diff view shows two return statements; the one-argument form appears to
 * be the line removed by the commit, the two-argument form (which also passes the parser
 * returned by getTimeAgoParser()) the line added by it.
 */
@Override
|
||||
public SearchEngine getSearchEngine() {
|
||||
return new YoutubeSearchEngine(getServiceId());
|
||||
return new YoutubeSearchEngine(getServiceId(), getTimeAgoParser());
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -104,12 +108,36 @@ public class YoutubeService extends StreamingService {
|
|||
return list;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the phrases used to parse upload date in the format '2 days ago'.
|
||||
* @param secondsPhrases How to recognize seconds
|
||||
* @param minutesPhrases How to recognize minutes
|
||||
* @param hoursPhrases How to recognize hours
|
||||
* @param daysPhrases How to recognize days
|
||||
* @param weeksPhrases How to recognize weeks
|
||||
* @param monthsPhrases How to recognize months
|
||||
* @param yearsPhrases How to recognize years
|
||||
*/
|
||||
public void setTimeAgoParserPhrases(String[] secondsPhrases, String[] minutesPhrases,
|
||||
String[] hoursPhrases, String[] daysPhrases,
|
||||
String[] weeksPhrases, String[] monthsPhrases,
|
||||
String[] yearsPhrases) {
|
||||
timeAgoParserPhrases = new EnumMap<>(TimeAgoParser.TimeAgoUnit.class);
|
||||
timeAgoParserPhrases.put(TimeAgoParser.TimeAgoUnit.SECONDS, secondsPhrases);
|
||||
timeAgoParserPhrases.put(TimeAgoParser.TimeAgoUnit.MINUTES, minutesPhrases);
|
||||
timeAgoParserPhrases.put(TimeAgoParser.TimeAgoUnit.HOURS, hoursPhrases);
|
||||
timeAgoParserPhrases.put(TimeAgoParser.TimeAgoUnit.DAYS, daysPhrases);
|
||||
timeAgoParserPhrases.put(TimeAgoParser.TimeAgoUnit.WEEKS, weeksPhrases);
|
||||
timeAgoParserPhrases.put(TimeAgoParser.TimeAgoUnit.MONTHS, monthsPhrases);
|
||||
timeAgoParserPhrases.put(TimeAgoParser.TimeAgoUnit.YEARS, yearsPhrases);
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new TimeAgoParser on every call. The ternary return below uses the
* timeAgoParserPhrases field when it is non-null and TimeAgoParser.DEFAULT_AGO_PHRASES
* otherwise.
* NOTE(review): the first return (always DEFAULT_AGO_PHRASES) appears to be the line removed
* by the commit shown in this diff view.
*
* @return A helper to parse upload dates in the format '2 days ago'.
|
||||
*
|
||||
* TODO Introduce support for multiple languages.
|
||||
*/
|
||||
TimeAgoParser getTimeAgoParser() {
|
||||
return new TimeAgoParser(TimeAgoParser.DEFAULT_AGO_PHRASES);
|
||||
return new TimeAgoParser(timeAgoParserPhrases == null ?
|
||||
TimeAgoParser.DEFAULT_AGO_PHRASES : timeAgoParserPhrases);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -26,6 +26,7 @@ import org.schabi.newpipe.extractor.stream.StreamInfoItemCollector;
|
|||
import org.schabi.newpipe.extractor.stream.StreamInfoItemExtractor;
|
||||
import org.schabi.newpipe.extractor.stream.StreamType;
|
||||
import org.schabi.newpipe.extractor.stream.SubtitlesFormat;
|
||||
import org.schabi.newpipe.extractor.stream.TimeAgoParser;
|
||||
import org.schabi.newpipe.extractor.stream.VideoStream;
|
||||
import org.schabi.newpipe.extractor.utils.Parser;
|
||||
import org.schabi.newpipe.extractor.utils.Utils;
|
||||
|
@ -87,6 +88,8 @@ public class YoutubeStreamExtractor extends StreamExtractor<YoutubeService> {
|
|||
|
||||
/*//////////////////////////////////////////////////////////////////////////*/
|
||||
|
||||
private final TimeAgoParser timeAgoParser = getService().getTimeAgoParser();
|
||||
|
||||
private Document doc;
|
||||
@Nullable
|
||||
private JsonObject playerArgs;
|
||||
|
@ -812,7 +815,7 @@ public class YoutubeStreamExtractor extends StreamExtractor<YoutubeService> {
|
|||
* This is encapsulated in a StreamInfoItem object, which is a subset of the fields in a full StreamInfo.
|
||||
*/
|
||||
private StreamInfoItemExtractor extractVideoPreviewInfo(final Element li) {
|
||||
return new YoutubeStreamInfoItemExtractor(li, null) {
|
||||
return new YoutubeStreamInfoItemExtractor(li, timeAgoParser) {
|
||||
|
||||
@Override
|
||||
public String getUrl() throws ParsingException {
|
||||
|
|
|
@ -31,6 +31,7 @@ import org.schabi.newpipe.extractor.exceptions.ExtractionException;
|
|||
import org.schabi.newpipe.extractor.exceptions.ParsingException;
|
||||
import org.schabi.newpipe.extractor.kiosk.KioskExtractor;
|
||||
import org.schabi.newpipe.extractor.stream.StreamInfoItemCollector;
|
||||
import org.schabi.newpipe.extractor.stream.TimeAgoParser;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
|
@ -38,7 +39,7 @@ import javax.annotation.Nonnull;
|
|||
|
||||
public class YoutubeTrendingExtractor extends KioskExtractor<YoutubeService> {
|
||||
|
||||
|
||||
private final TimeAgoParser timeAgoParser = getService().getTimeAgoParser();
|
||||
|
||||
private Document doc;
|
||||
|
||||
|
@ -91,7 +92,7 @@ public class YoutubeTrendingExtractor extends KioskExtractor<YoutubeService> {
|
|||
for(Element ul : uls) {
|
||||
for(final Element li : ul.children()) {
|
||||
final Element el = li.select("div[class*=\"yt-lockup-dismissable\"]").first();
|
||||
collector.commit(new YoutubeStreamInfoItemExtractor(li, null) {
|
||||
collector.commit(new YoutubeStreamInfoItemExtractor(li, timeAgoParser) {
|
||||
@Override
|
||||
public String getUrl() throws ParsingException {
|
||||
try {
|
||||
|
|
|
@ -20,9 +20,9 @@ public class TimeAgoParser {
|
|||
* A set of English phrases that are contained in the time units.
|
||||
* (e.g. '7 minutes ago' contains 'min')
|
||||
*/
|
||||
public static Map<TimeAgoUnit, String> DEFAULT_AGO_PHRASES = new EnumMap<>(TimeAgoUnit.class);
|
||||
public static Map<TimeAgoUnit, String[]> DEFAULT_AGO_PHRASES = new EnumMap<>(TimeAgoUnit.class);
|
||||
|
||||
private final Map<TimeAgoUnit, String> agoPhrases;
|
||||
private final Map<TimeAgoUnit, String[]> agoPhrases;
|
||||
|
||||
private final Calendar consistentNow;
|
||||
|
||||
|
@ -33,73 +33,82 @@ public class TimeAgoParser {
|
|||
* </p>
|
||||
* @param agoPhrases The phrases used to recognize each time unit in a given language.
|
||||
*/
|
||||
public TimeAgoParser(Map<TimeAgoUnit, String> agoPhrases) {
|
||||
public TimeAgoParser(Map<TimeAgoUnit, String[]> agoPhrases) {
|
||||
// The map is stored by reference, not copied.
this.agoPhrases = agoPhrases;
|
||||
// Snapshot of the current time, taken once when the parser is constructed.
consistentNow = Calendar.getInstance();
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses a textual date in the format '2 days ago' into a Calendar representation.
|
||||
* Sets the time to the beginning of the day/week/month/year if no exact time is available.
* <p>
* The amount is parsed before the unit. If the text contains no valid number, the unit is
* not looked up at all and the result is the calendar for zero seconds ago, so no
* ParsingException is thrown in that case.
* </p>
|
||||
* @param textualDate The original date as provided by the streaming service
|
||||
* @return The parsed (approximated) time
|
||||
* @throws ParsingException if the time unit could not be recognized
|
||||
*/
|
||||
public Calendar parse(String textualDate) throws ParsingException {
|
||||
try {
|
||||
int timeAgoValue = parseTimeAgoValue(textualDate);
|
||||
int timeAgoAmount = parseTimeAgoAmount(textualDate);
|
||||
TimeAgoUnit timeAgoUnit = parseTimeAgoUnit(textualDate);
|
||||
|
||||
return getCalendar(timeAgoValue, timeAgoUnit);
|
||||
return getCalendar(timeAgoAmount, timeAgoUnit);
|
||||
} catch (NumberFormatException e) {
|
||||
// If there is no valid number in the textual date, assume it is 'moments ago'.
|
||||
return getCalendar(0, TimeAgoUnit.SECONDS);
|
||||
}
|
||||
}
|
||||
|
||||
private int parseTimeAgoValue(String textualDate) throws NumberFormatException {
|
||||
private int parseTimeAgoAmount(String textualDate) throws NumberFormatException {
|
||||
String timeValueStr = textualDate.replaceAll("\\D+", "");
|
||||
return Integer.parseInt(timeValueStr);
|
||||
}
|
||||
|
||||
private TimeAgoUnit parseTimeAgoUnit(String textualDate) throws ParsingException {
|
||||
for (TimeAgoUnit timeAgoUnit : agoPhrases.keySet()) {
|
||||
if (textualDate.contains(agoPhrases.get(timeAgoUnit))) {
|
||||
return timeAgoUnit;
|
||||
for (String agoPhrase : agoPhrases.get(timeAgoUnit)) {
|
||||
if (textualDate.toLowerCase().contains(agoPhrase.toLowerCase())){
|
||||
return timeAgoUnit;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
throw new ParsingException("Unable to parse the date: " + textualDate);
|
||||
}
|
||||
|
||||
private Calendar getCalendar(int timeAgoValue, TimeAgoUnit timeAgoUnit) {
|
||||
private Calendar getCalendar(int timeAgoAmount, TimeAgoUnit timeAgoUnit) {
|
||||
Calendar calendarTime = getNow();
|
||||
|
||||
switch (timeAgoUnit) {
|
||||
case SECONDS:
|
||||
calendarTime.add(Calendar.SECOND, -timeAgoValue);
|
||||
calendarTime.add(Calendar.SECOND, -timeAgoAmount);
|
||||
break;
|
||||
|
||||
case MINUTES:
|
||||
calendarTime.add(Calendar.MINUTE, -timeAgoValue);
|
||||
calendarTime.add(Calendar.MINUTE, -timeAgoAmount);
|
||||
break;
|
||||
|
||||
case HOURS:
|
||||
calendarTime.add(Calendar.HOUR_OF_DAY, -timeAgoValue);
|
||||
calendarTime.add(Calendar.HOUR_OF_DAY, -timeAgoAmount);
|
||||
break;
|
||||
|
||||
case DAYS:
|
||||
calendarTime.add(Calendar.DAY_OF_MONTH, -timeAgoValue);
|
||||
calendarTime.add(Calendar.DAY_OF_MONTH, -timeAgoAmount);
|
||||
resetTimeOfDay(calendarTime);
|
||||
break;
|
||||
|
||||
case WEEKS:
|
||||
calendarTime.add(Calendar.WEEK_OF_MONTH, -timeAgoValue);
|
||||
calendarTime.add(Calendar.WEEK_OF_YEAR, -timeAgoAmount);
|
||||
calendarTime.set(Calendar.DAY_OF_WEEK, calendarTime.getFirstDayOfWeek());
|
||||
resetTimeOfDay(calendarTime);
|
||||
break;
|
||||
|
||||
case MONTHS:
|
||||
calendarTime.add(Calendar.MONTH, -timeAgoValue);
|
||||
calendarTime.add(Calendar.MONTH, -timeAgoAmount);
|
||||
calendarTime.set(Calendar.DAY_OF_MONTH, 1);
|
||||
resetTimeOfDay(calendarTime);
|
||||
break;
|
||||
|
||||
case YEARS:
|
||||
calendarTime.add(Calendar.YEAR, -timeAgoValue);
|
||||
calendarTime.add(Calendar.YEAR, -timeAgoAmount);
|
||||
calendarTime.set(Calendar.MONTH, 1);
|
||||
calendarTime.set(Calendar.DAY_OF_MONTH, 1);
|
||||
resetTimeOfDay(calendarTime);
|
||||
|
@ -117,19 +126,20 @@ public class TimeAgoParser {
|
|||
calendarTime.set(Calendar.HOUR_OF_DAY, 0);
|
||||
calendarTime.set(Calendar.MINUTE, 0);
|
||||
calendarTime.set(Calendar.SECOND, 0);
|
||||
calendarTime.set(Calendar.MILLISECOND, 0);
|
||||
}
|
||||
|
||||
// Registers the default English phrase(s) for every time unit in DEFAULT_AGO_PHRASES.
// NOTE(review): this diff view shows both variants; the plain String puts appear to be the
// lines removed by the commit, the String[] puts match the Map<TimeAgoUnit, String[]> type.
static {
|
||||
DEFAULT_AGO_PHRASES.put(TimeAgoUnit.SECONDS, "sec");
|
||||
DEFAULT_AGO_PHRASES.put(TimeAgoUnit.MINUTES, "min");
|
||||
DEFAULT_AGO_PHRASES.put(TimeAgoUnit.HOURS, "hour");
|
||||
DEFAULT_AGO_PHRASES.put(TimeAgoUnit.DAYS, "day");
|
||||
DEFAULT_AGO_PHRASES.put(TimeAgoUnit.WEEKS, "week");
|
||||
DEFAULT_AGO_PHRASES.put(TimeAgoUnit.MONTHS, "month");
|
||||
DEFAULT_AGO_PHRASES.put(TimeAgoUnit.YEARS, "year");
|
||||
DEFAULT_AGO_PHRASES.put(TimeAgoUnit.SECONDS, new String[]{"sec"});
|
||||
DEFAULT_AGO_PHRASES.put(TimeAgoUnit.MINUTES, new String[]{"min"});
|
||||
DEFAULT_AGO_PHRASES.put(TimeAgoUnit.HOURS, new String[]{"hour"});
|
||||
DEFAULT_AGO_PHRASES.put(TimeAgoUnit.DAYS, new String[]{"day"});
|
||||
DEFAULT_AGO_PHRASES.put(TimeAgoUnit.WEEKS, new String[]{"week"});
|
||||
DEFAULT_AGO_PHRASES.put(TimeAgoUnit.MONTHS, new String[]{"month"});
|
||||
DEFAULT_AGO_PHRASES.put(TimeAgoUnit.YEARS, new String[]{"year"});
|
||||
}
|
||||
|
||||
enum TimeAgoUnit {
|
||||
public enum TimeAgoUnit {
|
||||
SECONDS,
|
||||
MINUTES,
|
||||
HOURS,
|
||||
|
|
|
@ -9,6 +9,7 @@ import org.schabi.newpipe.extractor.NewPipe;
|
|||
import org.schabi.newpipe.extractor.channel.ChannelInfoItem;
|
||||
import org.schabi.newpipe.extractor.search.SearchEngine;
|
||||
import org.schabi.newpipe.extractor.search.SearchResult;
|
||||
import org.schabi.newpipe.extractor.stream.TimeAgoParser;
|
||||
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
|
@ -45,7 +46,8 @@ public class YoutubeSearchEngineAllTest extends BaseYoutubeSearchTest {
|
|||
@BeforeClass
|
||||
public static void setUpClass() throws Exception {
|
||||
NewPipe.init(Downloader.getInstance());
|
||||
YoutubeSearchEngine engine = new YoutubeSearchEngine(1);
|
||||
YoutubeSearchEngine engine = new YoutubeSearchEngine(1,
|
||||
new TimeAgoParser(TimeAgoParser.DEFAULT_AGO_PHRASES));
|
||||
|
||||
result = engine.search("pewdiepie", 0, "de", SearchEngine.Filter.ANY)
|
||||
.getSearchResult();
|
||||
|
|
Loading…
Reference in New Issue